diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index d72601a404673..7906e0ee9d785 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -2026,6 +2026,8 @@ def HasAddNoCarryInsts : Predicate<"Subtarget->hasAddNoCarry()">, def NotHasAddNoCarryInsts : Predicate<"!Subtarget->hasAddNoCarry()">; +def HasXNACKEnabled : Predicate<"Subtarget->isXNACKEnabled()">; + def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">, AssemblerPredicate<(all_of Feature16BitInsts)>; diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td index de8f0f9cd62c3..9e470e27272c3 100644 --- a/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -866,45 +866,74 @@ def SMRDBufferImm : ComplexPattern; def SMRDBufferImm32 : ComplexPattern; def SMRDBufferSgprImm : ComplexPattern; -multiclass SMRD_Pattern { +class SMRDAlignedLoadPat : PatFrag <(ops node:$ptr), (Op node:$ptr), [{ + // Returns true if it is a single dword load or naturally aligned multi-dword load. + LoadSDNode *Ld = cast(N); + unsigned Size = Ld->getMemoryVT().getStoreSize(); + return Size <= 4 || Ld->getAlign().value() >= Size; +}]> { + let GISelPredicateCode = [{ + auto &Ld = cast(MI); + TypeSize Size = Ld.getMMO().getSize().getValue(); + return Size <= 4 || Ld.getMMO().getAlign().value() >= Size; + }]; +} + +def aligned_smrd_load : SMRDAlignedLoadPat; +multiclass SMRD_Patterns { // 1. IMM offset def : GCNPat < - (smrd_load (SMRDImm i64:$sbase, i32:$offset)), - (vt (!cast(Instr#"_IMM") $sbase, $offset, 0)) - >; + (frag (SMRDImm i64:$sbase, i32:$offset)), + (vt (!cast(Instr#"_IMM"#suffix) $sbase, $offset, 0))>; // 2. 32-bit IMM offset on CI if immci then def : GCNPat < - (smrd_load (SMRDImm32 i64:$sbase, i32:$offset)), - (vt (!cast(Instr#"_IMM_ci") $sbase, $offset, 0))> { - let OtherPredicates = [isGFX7Only]; + (frag (SMRDImm32 i64:$sbase, i32:$offset)), + (vt (!cast(Instr#"_IMM_ci"#suffix) $sbase, $offset, 0))> { + let SubtargetPredicate = isGFX7Only; } // 3. SGPR offset def : GCNPat < - (smrd_load (SMRDSgpr i64:$sbase, i32:$soffset)), - (vt (!cast(Instr#"_SGPR") $sbase, $soffset, 0))> { - let OtherPredicates = [isNotGFX9Plus]; + (frag (SMRDSgpr i64:$sbase, i32:$soffset)), + (vt (!cast(Instr#"_SGPR"#suffix) $sbase, $soffset, 0))> { + let SubtargetPredicate = isNotGFX9Plus; } def : GCNPat < - (smrd_load (SMRDSgpr i64:$sbase, i32:$soffset)), - (vt (!cast(Instr#"_SGPR_IMM") $sbase, $soffset, 0, 0))> { - let OtherPredicates = [isGFX9Plus]; + (frag (SMRDSgpr i64:$sbase, i32:$soffset)), + (vt (!cast(Instr#"_SGPR_IMM"#suffix) $sbase, $soffset, 0, 0))> { + let SubtargetPredicate = isGFX9Plus; } // 4. SGPR+IMM offset def : GCNPat < - (smrd_load (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)), - (vt (!cast(Instr#"_SGPR_IMM") $sbase, $soffset, $offset, 0))> { - let OtherPredicates = [isGFX9Plus]; + (frag (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)), + (vt (!cast(Instr#"_SGPR_IMM"#suffix) $sbase, $soffset, $offset, 0))> { + let SubtargetPredicate = isGFX9Plus; } // 5. No offset def : GCNPat < - (vt (smrd_load (i64 SReg_64:$sbase))), - (vt (!cast(Instr#"_IMM") i64:$sbase, 0, 0)) - >; + (vt (frag (i64 SReg_64:$sbase))), + (vt (!cast(Instr#"_IMM"#suffix) i64:$sbase, 0, 0))>; +} + +multiclass SMRD_Pattern { + // High priority when XNACK is enabled and the load was naturally aligned. + let OtherPredicates = [HasXNACKEnabled], AddedComplexity = 102 in + defm: SMRD_Patterns ; + + // XNACK is enabled and the load wasn't naturally aligned. The constrained sload variant. + if !gt(vt.Size, 32) then { + let OtherPredicates = [HasXNACKEnabled], AddedComplexity = 101 in + defm: SMRD_Patterns ; + } + + // XNACK is disabled. + let AddedComplexity = 100 in + defm: SMRD_Patterns ; } multiclass SMLoad_Pattern { @@ -1018,6 +1047,8 @@ defm : ScalarBufferLoadIntrinsicPat ; defm : ScalarBufferLoadIntrinsicPat ; defm : ScalarBufferLoadIntrinsicPat ; +} // End let AddedComplexity = 100 + foreach vt = Reg32Types.types in { defm : SMRD_Pattern <"S_LOAD_DWORD", vt>; } @@ -1042,7 +1073,6 @@ foreach vt = SReg_512.RegTypes in { defm : SMRD_Pattern <"S_LOAD_DWORDX16", vt>; } -} // End let AddedComplexity = 100 defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORD", i32>; defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX2", v2i32>; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll index 453b229bf62bd..7525e00e6f401 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll @@ -1021,20 +1021,20 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret(ptr addrspace(1) %ptr, double %data) { ; GFX90A-LABEL: global_atomic_fadd_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] ; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) @@ -1044,20 +1044,20 @@ main_body: define amdgpu_kernel void @global_atomic_fmin_f64_noret(ptr addrspace(1) %ptr, double %data) { ; GFX90A-LABEL: global_atomic_fmin_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NEXT: global_atomic_min_f64 v2, v[0:1], s[0:1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: global_atomic_min_f64 v2, v[0:1], s[4:5] ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fmin_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NEXT: global_atomic_min_f64 v2, v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX940-NEXT: global_atomic_min_f64 v2, v[0:1], s[4:5] ; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data) @@ -1067,20 +1067,20 @@ main_body: define amdgpu_kernel void @global_atomic_fmax_f64_noret(ptr addrspace(1) %ptr, double %data) { ; GFX90A-LABEL: global_atomic_fmax_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NEXT: global_atomic_max_f64 v2, v[0:1], s[0:1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: global_atomic_max_f64 v2, v[0:1], s[4:5] ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fmax_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NEXT: global_atomic_max_f64 v2, v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX940-NEXT: global_atomic_max_f64 v2, v[0:1], s[4:5] ; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data) @@ -1134,14 +1134,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB39_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] sc1 +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: .LBB39_2: @@ -1162,13 +1162,13 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB40_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] +; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: .LBB40_2: @@ -1184,14 +1184,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB40_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: .LBB40_2: @@ -1248,14 +1248,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB41_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] sc1 +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: .LBB41_2: @@ -1276,13 +1276,13 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB42_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] +; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: .LBB42_2: @@ -1298,14 +1298,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB42_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: .LBB42_2: @@ -1522,14 +1522,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB49_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: .LBB49_2: @@ -1761,19 +1761,19 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret(ptr %ptr, double %data) { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: flat_atomic_fadd_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[6:7] ; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; GFX940-NEXT: s_endpgm main_body: @@ -1842,19 +1842,19 @@ main_body: define amdgpu_kernel void @flat_atomic_fmin_f64_noret(ptr %ptr, double %data) { ; GFX90A-LABEL: flat_atomic_fmin_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: flat_atomic_fmin_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[6:7] ; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] ; GFX940-NEXT: s_endpgm main_body: @@ -1884,19 +1884,19 @@ main_body: define amdgpu_kernel void @flat_atomic_fmax_f64_noret(ptr %ptr, double %data) { ; GFX90A-LABEL: flat_atomic_fmax_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: flat_atomic_fmax_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[6:7] ; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] ; GFX940-NEXT: s_endpgm main_body: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir index 5d4816812e6c0..6a1e52cd29fd9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -enable-unsafe-fp-math -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -enable-unsafe-fp-math -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -enable-unsafe-fp-math -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GFX10 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -enable-unsafe-fp-math -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GFX11 --- name: fract_f64_neg @@ -12,23 +12,41 @@ body: | bb.1: liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: fract_f64_neg - ; CHECK: liveins: $sgpr0_sgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4) - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub2_sub3 - ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (load (s64), addrspace 1) - ; CHECK-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B]] - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]] - ; CHECK-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[COPY3]], 1, [[COPY4]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_FLOOR_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_FLOOR_F64_e64 0, [[V_ADD_F64_e64_]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 1, [[V_FLOOR_F64_e64_]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], [[V_ADD_F64_e64_1]], [[COPY1]], 0, 0, implicit $exec :: (store (s64), addrspace 1) - ; CHECK-NEXT: S_ENDPGM 0 + ; GFX10-LABEL: name: fract_f64_neg + ; GFX10: liveins: $sgpr0_sgpr1 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX10-NEXT: early-clobber %3:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4) + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY %3.sub0_sub1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY %3.sub2_sub3 + ; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (load (s64), addrspace 1) + ; GFX10-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B]] + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]] + ; GFX10-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[COPY3]], 1, [[COPY4]], 0, 0, implicit $mode, implicit $exec + ; GFX10-NEXT: [[V_FLOOR_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_FLOOR_F64_e64 0, [[V_ADD_F64_e64_]], 0, 0, implicit $mode, implicit $exec + ; GFX10-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 1, [[V_FLOOR_F64_e64_]], 0, 0, implicit $mode, implicit $exec + ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX10-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], [[V_ADD_F64_e64_1]], [[COPY1]], 0, 0, implicit $exec :: (store (s64), addrspace 1) + ; GFX10-NEXT: S_ENDPGM 0 + ; + ; GFX11-LABEL: name: fract_f64_neg + ; GFX11: liveins: $sgpr0_sgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX11-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4) + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub2_sub3 + ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (load (s64), addrspace 1) + ; GFX11-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B]] + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]] + ; GFX11-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[COPY3]], 1, [[COPY4]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_FLOOR_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_FLOOR_F64_e64 0, [[V_ADD_F64_e64_]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 1, [[V_FLOOR_F64_e64_]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX11-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], [[V_ADD_F64_e64_1]], [[COPY1]], 0, 0, implicit $exec :: (store (s64), addrspace 1) + ; GFX11-NEXT: S_ENDPGM 0 %2:sgpr(p4) = COPY $sgpr0_sgpr1 %7:sgpr(s64) = G_CONSTANT i64 36 %8:sgpr(p4) = G_PTR_ADD %2, %7(s64) @@ -60,23 +78,41 @@ body: | bb.1: liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: fract_f64_neg_abs - ; CHECK: liveins: $sgpr0_sgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4) - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub2_sub3 - ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (load (s64), addrspace 1) - ; CHECK-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B]] - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]] - ; CHECK-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[COPY3]], 3, [[COPY4]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_FLOOR_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_FLOOR_F64_e64 0, [[V_ADD_F64_e64_]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 1, [[V_FLOOR_F64_e64_]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], [[V_ADD_F64_e64_1]], [[COPY1]], 0, 0, implicit $exec :: (store (s64), addrspace 1) - ; CHECK-NEXT: S_ENDPGM 0 + ; GFX10-LABEL: name: fract_f64_neg_abs + ; GFX10: liveins: $sgpr0_sgpr1 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX10-NEXT: early-clobber %3:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4) + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY %3.sub0_sub1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY %3.sub2_sub3 + ; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (load (s64), addrspace 1) + ; GFX10-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B]] + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]] + ; GFX10-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[COPY3]], 3, [[COPY4]], 0, 0, implicit $mode, implicit $exec + ; GFX10-NEXT: [[V_FLOOR_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_FLOOR_F64_e64 0, [[V_ADD_F64_e64_]], 0, 0, implicit $mode, implicit $exec + ; GFX10-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 1, [[V_FLOOR_F64_e64_]], 0, 0, implicit $mode, implicit $exec + ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX10-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], [[V_ADD_F64_e64_1]], [[COPY1]], 0, 0, implicit $exec :: (store (s64), addrspace 1) + ; GFX10-NEXT: S_ENDPGM 0 + ; + ; GFX11-LABEL: name: fract_f64_neg_abs + ; GFX11: liveins: $sgpr0_sgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX11-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4) + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub2_sub3 + ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (load (s64), addrspace 1) + ; GFX11-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B]] + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]] + ; GFX11-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[COPY3]], 3, [[COPY4]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_FLOOR_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_FLOOR_F64_e64 0, [[V_ADD_F64_e64_]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 1, [[V_FLOOR_F64_e64_]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX11-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], [[V_ADD_F64_e64_1]], [[COPY1]], 0, 0, implicit $exec :: (store (s64), addrspace 1) + ; GFX11-NEXT: S_ENDPGM 0 %2:sgpr(p4) = COPY $sgpr0_sgpr1 %7:sgpr(s64) = G_CONSTANT i64 36 %8:sgpr(p4) = G_PTR_ADD %2, %7(s64) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir index 504f7697a0fcc..02c6220ac2aca 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir @@ -3,7 +3,7 @@ # RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX7 %s # RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX8 %s # RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s -# RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s +# RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX11 %s --- @@ -44,6 +44,13 @@ body: | ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (load (s32), addrspace 4) ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]] + ; + ; GFX11-LABEL: name: load_constant_s32_from_4 + ; GFX11: liveins: $sgpr0_sgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (load (s32), addrspace 4) + ; GFX11-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]] %0:sgpr(p4) = COPY $sgpr0_sgpr1 %1:sgpr(s32) = G_LOAD %0 :: (load (s32), align 4, addrspace 4) $sgpr0 = COPY %1 @@ -89,6 +96,13 @@ body: | ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (load (<2 x s16>), addrspace 4) ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]] + ; + ; GFX11-LABEL: name: load_constant_v2s16_from_4 + ; GFX11: liveins: $sgpr0_sgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (load (<2 x s16>), addrspace 4) + ; GFX11-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]] %0:sgpr(p4) = COPY $sgpr0_sgpr1 %1:sgpr(<2 x s16>) = G_LOAD %0 :: (load (<2 x s16>), align 4, addrspace 4) $sgpr0 = COPY %1 @@ -133,6 +147,13 @@ body: | ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 ; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (<2 x s32>), addrspace 4) ; GFX10-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]] + ; + ; GFX11-LABEL: name: load_constant_v2s32 + ; GFX11: liveins: $sgpr0_sgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (<2 x s32>), addrspace 4) + ; GFX11-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]] %0:sgpr(p4) = COPY $sgpr0_sgpr1 %1:sgpr(<2 x s32>) = G_LOAD %0 :: (load (<2 x s32>), align 8, addrspace 4) $sgpr0_sgpr1 = COPY %1 @@ -174,8 +195,15 @@ body: | ; GFX10: liveins: $sgpr0_sgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (<2 x s32>), align 4, addrspace 4) - ; GFX10-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]] + ; GFX10-NEXT: early-clobber %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]], 0, 0 :: (load (<2 x s32>), align 4, addrspace 4) + ; GFX10-NEXT: $sgpr0_sgpr1 = COPY %1 + ; + ; GFX11-LABEL: name: load_constant_v2s32_align4 + ; GFX11: liveins: $sgpr0_sgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (<2 x s32>), align 4, addrspace 4) + ; GFX11-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]] %0:sgpr(p4) = COPY $sgpr0_sgpr1 %1:sgpr(<2 x s32>) = G_LOAD %0 :: (load (<2 x s32>), align 4, addrspace 4) $sgpr0_sgpr1 = COPY %1 @@ -217,8 +245,15 @@ body: | ; GFX10: liveins: $sgpr0_sgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (<4 x s16>), align 4, addrspace 4) - ; GFX10-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]] + ; GFX10-NEXT: early-clobber %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]], 0, 0 :: (load (<4 x s16>), align 4, addrspace 4) + ; GFX10-NEXT: $sgpr0_sgpr1 = COPY %1 + ; + ; GFX11-LABEL: name: load_constant_v4s16_align4 + ; GFX11: liveins: $sgpr0_sgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (<4 x s16>), align 4, addrspace 4) + ; GFX11-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]] %0:sgpr(p4) = COPY $sgpr0_sgpr1 %1:sgpr(<4 x s16>) = G_LOAD %0 :: (load (<4 x s16>), align 4, addrspace 4) $sgpr0_sgpr1 = COPY %1 @@ -261,8 +296,15 @@ body: | ; GFX10: liveins: $sgpr0_sgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; GFX10-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<4 x s32>), align 4, addrspace 4) - ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]] + ; GFX10-NEXT: early-clobber %1:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[COPY]], 0, 0 :: (load (<4 x s32>), align 4, addrspace 4) + ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1 + ; + ; GFX11-LABEL: name: load_constant_v4s32_align4 + ; GFX11: liveins: $sgpr0_sgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX11-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<4 x s32>), align 4, addrspace 4) + ; GFX11-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]] %0:sgpr(p4) = COPY $sgpr0_sgpr1 %1:sgpr(<4 x s32>) = G_LOAD %0 :: (load (<4 x s32>), align 4, addrspace 4) $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1 @@ -307,6 +349,13 @@ body: | ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 ; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (s64), addrspace 4) ; GFX10-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]] + ; + ; GFX11-LABEL: name: load_constant_s64 + ; GFX11: liveins: $sgpr0_sgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (s64), addrspace 4) + ; GFX11-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]] %0:sgpr(p4) = COPY $sgpr0_sgpr1 %1:sgpr(s64) = G_LOAD %0 :: (load (s64), align 8, addrspace 4) $sgpr0_sgpr1 = COPY %1 @@ -349,8 +398,15 @@ body: | ; GFX10: liveins: $sgpr0_sgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (s64), align 4, addrspace 4) - ; GFX10-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]] + ; GFX10-NEXT: early-clobber %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]], 0, 0 :: (load (s64), align 4, addrspace 4) + ; GFX10-NEXT: $sgpr0_sgpr1 = COPY %1 + ; + ; GFX11-LABEL: name: load_constant_s64_align4 + ; GFX11: liveins: $sgpr0_sgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (s64), align 4, addrspace 4) + ; GFX11-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]] %0:sgpr(p4) = COPY $sgpr0_sgpr1 %1:sgpr(s64) = G_LOAD %0 :: (load (s64), align 4, addrspace 4) $sgpr0_sgpr1 = COPY %1 @@ -393,8 +449,15 @@ body: | ; GFX10: liveins: $sgpr0_sgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; GFX10-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<2 x s64>), align 4, addrspace 4) - ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]] + ; GFX10-NEXT: early-clobber %1:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[COPY]], 0, 0 :: (load (<2 x s64>), align 4, addrspace 4) + ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1 + ; + ; GFX11-LABEL: name: load_constant_v2s64 + ; GFX11: liveins: $sgpr0_sgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX11-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<2 x s64>), align 4, addrspace 4) + ; GFX11-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]] %0:sgpr(p4) = COPY $sgpr0_sgpr1 %1:sgpr(<2 x s64>) = G_LOAD %0 :: (load (<2 x s64>), align 4, addrspace 4) $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1 @@ -439,6 +502,13 @@ body: | ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 ; GFX10-NEXT: [[LOAD:%[0-9]+]]:sgpr_128(<2 x p1>) = G_LOAD [[COPY]](p4) :: (load (<2 x p1>), align 4, addrspace 4) ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[LOAD]](<2 x p1>) + ; + ; GFX11-LABEL: name: load_constant_v2p1 + ; GFX11: liveins: $sgpr0_sgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; GFX11-NEXT: [[LOAD:%[0-9]+]]:sgpr_128(<2 x p1>) = G_LOAD [[COPY]](p4) :: (load (<2 x p1>), align 4, addrspace 4) + ; GFX11-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[LOAD]](<2 x p1>) %0:sgpr(p4) = COPY $sgpr0_sgpr1 %1:sgpr(<2 x p1>) = G_LOAD %0 :: (load (<2 x p1>), align 4, addrspace 4) $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1 @@ -483,6 +553,13 @@ body: | ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 ; GFX10-NEXT: [[LOAD:%[0-9]+]]:sgpr_128(s128) = G_LOAD [[COPY]](p4) :: (load (s128), align 4, addrspace 4) ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[LOAD]](s128) + ; + ; GFX11-LABEL: name: load_constant_s128_align4 + ; GFX11: liveins: $sgpr0_sgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; GFX11-NEXT: [[LOAD:%[0-9]+]]:sgpr_128(s128) = G_LOAD [[COPY]](p4) :: (load (s128), align 4, addrspace 4) + ; GFX11-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[LOAD]](s128) %0:sgpr(p4) = COPY $sgpr0_sgpr1 %1:sgpr(s128) = G_LOAD %0 :: (load (s128), align 4, addrspace 4) $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1 @@ -527,6 +604,13 @@ body: | ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (load (p3), addrspace 4) ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]] + ; + ; GFX11-LABEL: name: load_constant_p3_from_4 + ; GFX11: liveins: $sgpr0_sgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (load (p3), addrspace 4) + ; GFX11-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]] %0:sgpr(p4) = COPY $sgpr0_sgpr1 %1:sgpr(p3) = G_LOAD %0 :: (load (p3), align 4, addrspace 4) $sgpr0 = COPY %1 @@ -571,6 +655,13 @@ body: | ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 ; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (p4), addrspace 4) ; GFX10-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]] + ; + ; GFX11-LABEL: name: load_constant_p4_from_8 + ; GFX11: liveins: $sgpr0_sgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (p4), addrspace 4) + ; GFX11-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]] %0:sgpr(p4) = COPY $sgpr0_sgpr1 %1:sgpr(p4) = G_LOAD %0 :: (load (p4), align 8, addrspace 4) $sgpr0_sgpr1 = COPY %1 @@ -615,6 +706,13 @@ body: | ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 ; GFX10-NEXT: [[LOAD:%[0-9]+]]:sreg_64(p999) = G_LOAD [[COPY]](p4) :: (load (p999), addrspace 4) ; GFX10-NEXT: $sgpr0_sgpr1 = COPY [[LOAD]](p999) + ; + ; GFX11-LABEL: name: load_constant_p999_from_8 + ; GFX11: liveins: $sgpr0_sgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; GFX11-NEXT: [[LOAD:%[0-9]+]]:sreg_64(p999) = G_LOAD [[COPY]](p4) :: (load (p999), addrspace 4) + ; GFX11-NEXT: $sgpr0_sgpr1 = COPY [[LOAD]](p999) %0:sgpr(p4) = COPY $sgpr0_sgpr1 %1:sgpr(p999) = G_LOAD %0 :: (load (p999), align 8, addrspace 4) $sgpr0_sgpr1 = COPY %1 @@ -659,6 +757,13 @@ body: | ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 ; GFX10-NEXT: [[LOAD:%[0-9]+]]:sreg_64(<2 x p3>) = G_LOAD [[COPY]](p4) :: (load (<2 x p3>), addrspace 4) ; GFX10-NEXT: $sgpr0_sgpr1 = COPY [[LOAD]](<2 x p3>) + ; + ; GFX11-LABEL: name: load_constant_v2p3 + ; GFX11: liveins: $sgpr0_sgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; GFX11-NEXT: [[LOAD:%[0-9]+]]:sreg_64(<2 x p3>) = G_LOAD [[COPY]](p4) :: (load (<2 x p3>), addrspace 4) + ; GFX11-NEXT: $sgpr0_sgpr1 = COPY [[LOAD]](<2 x p3>) %0:sgpr(p4) = COPY $sgpr0_sgpr1 %1:sgpr(<2 x p3>) = G_LOAD %0 :: (load (<2 x p3>), align 8, addrspace 4) $sgpr0_sgpr1 = COPY %1 @@ -703,6 +808,13 @@ body: | ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (load (<2 x s16>), addrspace 4) ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]] + ; + ; GFX11-LABEL: name: load_constant_v2s16 + ; GFX11: liveins: $sgpr0_sgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (load (<2 x s16>), addrspace 4) + ; GFX11-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]] %0:sgpr(p4) = COPY $sgpr0_sgpr1 %1:sgpr(<2 x s16>) = G_LOAD %0 :: (load (<2 x s16>), align 4, addrspace 4) $sgpr0 = COPY %1 @@ -747,6 +859,13 @@ body: | ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 ; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (<4 x s16>), addrspace 4) ; GFX10-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]] + ; + ; GFX11-LABEL: name: load_constant_v4s16 + ; GFX11: liveins: $sgpr0_sgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (<4 x s16>), addrspace 4) + ; GFX11-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]] %0:sgpr(p4) = COPY $sgpr0_sgpr1 %1:sgpr(<4 x s16>) = G_LOAD %0 :: (load (<4 x s16>), align 8, addrspace 4) $sgpr0_sgpr1 = COPY %1 @@ -789,8 +908,15 @@ body: | ; GFX10: liveins: $sgpr0_sgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; GFX10-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<8 x s16>), align 4, addrspace 4) - ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]] + ; GFX10-NEXT: early-clobber %1:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[COPY]], 0, 0 :: (load (<8 x s16>), align 4, addrspace 4) + ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1 + ; + ; GFX11-LABEL: name: load_constant_v8s16 + ; GFX11: liveins: $sgpr0_sgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX11-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<8 x s16>), align 4, addrspace 4) + ; GFX11-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]] %0:sgpr(p4) = COPY $sgpr0_sgpr1 %1:sgpr(<8 x s16>) = G_LOAD %0 :: (load (<8 x s16>), align 4, addrspace 4) $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1 @@ -833,8 +959,15 @@ body: | ; GFX10: liveins: $sgpr0_sgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; GFX10-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (load (<8 x s32>), align 4, addrspace 4) - ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY [[S_LOAD_DWORDX8_IMM]] + ; GFX10-NEXT: early-clobber %1:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[COPY]], 0, 0 :: (load (<8 x s32>), align 4, addrspace 4) + ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY %1 + ; + ; GFX11-LABEL: name: load_constant_v8s32 + ; GFX11: liveins: $sgpr0_sgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX11-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (load (<8 x s32>), align 4, addrspace 4) + ; GFX11-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY [[S_LOAD_DWORDX8_IMM]] %0:sgpr(p4) = COPY $sgpr0_sgpr1 %1:sgpr(<8 x s32>) = G_LOAD %0 :: (load (<8 x s32>), align 4, addrspace 4) $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY %1 @@ -877,8 +1010,15 @@ body: | ; GFX10: liveins: $sgpr0_sgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; GFX10-NEXT: [[S_LOAD_DWORDX16_IMM:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM [[COPY]], 0, 0 :: (load (<16 x s32>), align 4, addrspace 4) - ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY [[S_LOAD_DWORDX16_IMM]] + ; GFX10-NEXT: early-clobber %1:sgpr_512 = S_LOAD_DWORDX16_IMM_ec [[COPY]], 0, 0 :: (load (<16 x s32>), align 4, addrspace 4) + ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %1 + ; + ; GFX11-LABEL: name: load_constant_v16s32 + ; GFX11: liveins: $sgpr0_sgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX11-NEXT: [[S_LOAD_DWORDX16_IMM:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM [[COPY]], 0, 0 :: (load (<16 x s32>), align 4, addrspace 4) + ; GFX11-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY [[S_LOAD_DWORDX16_IMM]] %0:sgpr(p4) = COPY $sgpr0_sgpr1 %1:sgpr(<16 x s32>) = G_LOAD %0 :: (load (<16 x s32>), align 4, addrspace 4) $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %1 @@ -921,8 +1061,15 @@ body: | ; GFX10: liveins: $sgpr0_sgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; GFX10-NEXT: [[S_LOAD_DWORDX16_IMM:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM [[COPY]], 0, 0 :: (load (<8 x s64>), align 4, addrspace 4) - ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY [[S_LOAD_DWORDX16_IMM]] + ; GFX10-NEXT: early-clobber %1:sgpr_512 = S_LOAD_DWORDX16_IMM_ec [[COPY]], 0, 0 :: (load (<8 x s64>), align 4, addrspace 4) + ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %1 + ; + ; GFX11-LABEL: name: load_constant_v8s64 + ; GFX11: liveins: $sgpr0_sgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX11-NEXT: [[S_LOAD_DWORDX16_IMM:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM [[COPY]], 0, 0 :: (load (<8 x s64>), align 4, addrspace 4) + ; GFX11-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY [[S_LOAD_DWORDX16_IMM]] %0:sgpr(p4) = COPY $sgpr0_sgpr1 %1:sgpr(<8 x s64>) = G_LOAD %0 :: (load (<8 x s64>), align 4, addrspace 4) $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %1 @@ -971,6 +1118,13 @@ body: | ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 1020, 0 :: (load (s32), addrspace 4) ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]] + ; + ; GFX11-LABEL: name: load_constant_s32_from_4_gep_1020 + ; GFX11: liveins: $sgpr0_sgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 1020, 0 :: (load (s32), addrspace 4) + ; GFX11-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]] %0:sgpr(p4) = COPY $sgpr0_sgpr1 %1:sgpr(s64) = G_CONSTANT i64 1020 %2:sgpr(p4) = G_PTR_ADD %0, %1 @@ -1018,6 +1172,13 @@ body: | ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 1024, 0 :: (load (s32), addrspace 4) ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]] + ; + ; GFX11-LABEL: name: load_constant_s32_from_4_gep_1024 + ; GFX11: liveins: $sgpr0_sgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 1024, 0 :: (load (s32), addrspace 4) + ; GFX11-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]] %0:sgpr(p4) = COPY $sgpr0_sgpr1 %1:sgpr(s64) = G_CONSTANT i64 1024 %2:sgpr(p4) = G_PTR_ADD %0, %1 @@ -1067,6 +1228,14 @@ body: | ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1048575 ; GFX10-NEXT: [[S_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_SGPR_IMM [[COPY]], [[S_MOV_B32_]], 0, 0 :: (load (s32), addrspace 4) ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_SGPR_IMM]] + ; + ; GFX11-LABEL: name: load_constant_s32_from_4_gep_1048575 + ; GFX11: liveins: $sgpr0_sgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1048575 + ; GFX11-NEXT: [[S_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_SGPR_IMM [[COPY]], [[S_MOV_B32_]], 0, 0 :: (load (s32), addrspace 4) + ; GFX11-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_SGPR_IMM]] %0:sgpr(p4) = COPY $sgpr0_sgpr1 %1:sgpr(s64) = G_CONSTANT i64 1048575 %2:sgpr(p4) = G_PTR_ADD %0, %1 @@ -1116,6 +1285,14 @@ body: | ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1048576 ; GFX10-NEXT: [[S_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_SGPR_IMM [[COPY]], [[S_MOV_B32_]], 0, 0 :: (load (s32), addrspace 4) ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_SGPR_IMM]] + ; + ; GFX11-LABEL: name: load_constant_s32_from_4_gep_1048576 + ; GFX11: liveins: $sgpr0_sgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1048576 + ; GFX11-NEXT: [[S_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_SGPR_IMM [[COPY]], [[S_MOV_B32_]], 0, 0 :: (load (s32), addrspace 4) + ; GFX11-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_SGPR_IMM]] %0:sgpr(p4) = COPY $sgpr0_sgpr1 %1:sgpr(s64) = G_CONSTANT i64 1048576 %2:sgpr(p4) = G_PTR_ADD %0, %1 @@ -1166,6 +1343,14 @@ body: | ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1073741823 ; GFX10-NEXT: [[S_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_SGPR_IMM [[COPY]], [[S_MOV_B32_]], 0, 0 :: (load (s32), addrspace 4) ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_SGPR_IMM]] + ; + ; GFX11-LABEL: name: load_constant_s32_from_4_gep_1073741823 + ; GFX11: liveins: $sgpr0_sgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1073741823 + ; GFX11-NEXT: [[S_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_SGPR_IMM [[COPY]], [[S_MOV_B32_]], 0, 0 :: (load (s32), addrspace 4) + ; GFX11-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_SGPR_IMM]] %0:sgpr(p4) = COPY $sgpr0_sgpr1 %1:sgpr(s64) = G_CONSTANT i64 1073741823 %2:sgpr(p4) = G_PTR_ADD %0, %1 @@ -1244,6 +1429,21 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[REG_SEQUENCE]], 0, 0 :: (load (s32), addrspace 4) ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]] + ; + ; GFX11-LABEL: name: load_constant_s32_from_4_gep_negative_1 + ; GFX11: liveins: $sgpr0_sgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX11-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1 + ; GFX11-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc + ; GFX11-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 + ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[REG_SEQUENCE]], 0, 0 :: (load (s32), addrspace 4) + ; GFX11-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]] %0:sgpr(p4) = COPY $sgpr0_sgpr1 %1:sgpr(s64) = G_CONSTANT i64 -1 %2:sgpr(p4) = G_PTR_ADD %0, %1 @@ -1322,6 +1522,21 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[REG_SEQUENCE]], 0, 0 :: (load (s32), addrspace 4) ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]] + ; + ; GFX11-LABEL: name: load_constant_s32_from_4_gep_negative_524288 + ; GFX11: liveins: $sgpr0_sgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX11-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -524288 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1 + ; GFX11-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc + ; GFX11-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 + ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[REG_SEQUENCE]], 0, 0 :: (load (s32), addrspace 4) + ; GFX11-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]] %0:sgpr(p4) = COPY $sgpr0_sgpr1 %1:sgpr(s64) = G_CONSTANT i64 -524288 %2:sgpr(p4) = G_PTR_ADD %0, %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll index 90f34acaa17aa..f88125ea02937 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll @@ -48,16 +48,16 @@ define amdgpu_kernel void @test_div_scale_f32_1(ptr addrspace(1) %out, ptr addrs ; ; GFX10-LABEL: test_div_scale_f32_1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s2, v2, v2, v1 +; GFX10-NEXT: v_div_scale_f32 v0, s0, v2, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_div_scale_f32_1: @@ -133,16 +133,16 @@ define amdgpu_kernel void @test_div_scale_f32_2(ptr addrspace(1) %out, ptr addrs ; ; GFX10-LABEL: test_div_scale_f32_2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s2, v1, v2, v1 +; GFX10-NEXT: v_div_scale_f32 v0, s0, v1, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_div_scale_f32_2: @@ -1266,14 +1266,14 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(ptr addrspace(1) %o ; ; GFX10-LABEL: test_div_scale_f32_inline_imm_num: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, 1.0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_div_scale_f32 v0, s0, v0, v0, 1.0 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_div_scale_f32_inline_imm_num: @@ -1337,14 +1337,14 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(ptr addrspace(1) %o ; ; GFX10-LABEL: test_div_scale_f32_inline_imm_den: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s2, 2.0, 2.0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_div_scale_f32 v0, s0, 2.0, 2.0, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_div_scale_f32_inline_imm_den: @@ -1416,17 +1416,17 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_num(ptr addrspace(1) %out, pt ; ; GFX10-LABEL: test_div_scale_f32_fabs_num: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_div_scale_f32 v0, s2, v2, v2, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_div_scale_f32 v0, s0, v2, v2, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_div_scale_f32_fabs_num: @@ -1508,17 +1508,17 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(ptr addrspace(1) %out, pt ; ; GFX10-LABEL: test_div_scale_f32_fabs_den: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v2 -; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, v1 +; GFX10-NEXT: v_div_scale_f32 v0, s0, v0, v0, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_div_scale_f32_fabs_den: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll index aa21e67544d65..4946d3759c2ef 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll @@ -63,14 +63,14 @@ define amdgpu_kernel void @mov_dpp64_test(ptr addrspace(1) %out, i64 %in1) { ; ; GFX10-LABEL: mov_dpp64_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; encoding: [0x01,0x00,0x08,0xf4,0x24,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; encoding: [0x01,0x01,0x08,0xf4,0x24,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; encoding: [0x80,0x02,0x04,0x7e] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] -; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; encoding: [0x02,0x02,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; encoding: [0x03,0x02,0x02,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v0, s6 ; encoding: [0x06,0x02,0x00,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v1, s7 ; encoding: [0x07,0x02,0x02,0x7e] ; GFX10-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x00,0x11] ; GFX10-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x02,0x7e,0x01,0x01,0x00,0x11] -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; encoding: [0x00,0x80,0x74,0xdc,0x02,0x00,0x00,0x00] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; encoding: [0x00,0x80,0x74,0xdc,0x02,0x00,0x04,0x00] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX11-LABEL: mov_dpp64_test: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll index 746bd739644a9..1e8209bd3fc6b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll @@ -64,16 +64,16 @@ define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i ; ; GFX10-LABEL: update_dppi64_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] -; GFX10-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s7 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 -; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: update_dppi64_test: @@ -120,16 +120,16 @@ define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1 ; ; GFX10-LABEL: update_dppf64_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] -; GFX10-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s7 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 -; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: update_dppf64_test: @@ -288,16 +288,16 @@ define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, p ; ; GFX10-LABEL: update_dpp_p0_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] -; GFX10-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s7 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 -; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: update_dpp_p0_test: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll index 577a7d0b4cba0..489f46d1237a3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll @@ -7,18 +7,18 @@ declare i32 @llvm.amdgcn.workitem.id.x() define amdgpu_kernel void @v_mul_i64_no_zext(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; GFX10-LABEL: v_mul_i64_no_zext: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v7, s[0:1] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v7, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v7, s[4:5] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v7, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, v0, v2, 0 ; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, v0, v3, v[5:6] ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v1, v2, v[5:6] ; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: global_store_dwordx2 v7, v[4:5], s[2:3] +; GFX10-NEXT: global_store_dwordx2 v7, v[4:5], s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64_no_zext: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index b0f3eee3c7363..42f1bf84c0420 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -2559,47 +2559,47 @@ define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr ; ; GFX8-LABEL: s_mul_u64_zext_with_sregs: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v0, 0x50 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0 -; GFX8-NEXT: s_mulk_i32 s2, 0x50 -; GFX8-NEXT: v_readfirstlane_b32 s3, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 +; GFX8-NEXT: s_mulk_i32 s0, 0x50 +; GFX8-NEXT: v_readfirstlane_b32 s1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: s_mul_u64_zext_with_sregs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s2, s3, 0x50 -; GFX9-NEXT: s_mul_hi_u32 s3, s3, 0x50 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_mul_i32 s0, s1, 0x50 +; GFX9-NEXT: s_mul_hi_u32 s1, s1, 0x50 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_mul_u64_zext_with_sregs: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX10-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mul_i32 s2, s3, 0x50 -; GFX10-NEXT: s_mul_hi_u32 s3, s3, 0x50 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: s_mul_i32 s0, s1, 0x50 +; GFX10-NEXT: s_mul_hi_u32 s1, s1, 0x50 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_mul_u64_zext_with_sregs: @@ -2738,56 +2738,56 @@ define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr ; ; GFX8-LABEL: s_mul_u64_sext_with_sregs: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v0, 0x50 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0 -; GFX8-NEXT: s_ashr_i32 s3, s2, 31 -; GFX8-NEXT: s_mulk_i32 s2, 0x50 -; GFX8-NEXT: s_mulk_i32 s3, 0x50 -; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: s_add_u32 s3, s3, s4 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 +; GFX8-NEXT: s_ashr_i32 s1, s0, 31 +; GFX8-NEXT: s_mulk_i32 s0, 0x50 +; GFX8-NEXT: s_mulk_i32 s1, 0x50 +; GFX8-NEXT: v_readfirstlane_b32 s2, v0 +; GFX8-NEXT: s_add_u32 s1, s1, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: s_mul_u64_sext_with_sregs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s4, s3, 31 -; GFX9-NEXT: s_mul_i32 s2, s3, 0x50 -; GFX9-NEXT: s_mul_hi_u32 s3, s3, 0x50 -; GFX9-NEXT: s_mulk_i32 s4, 0x50 -; GFX9-NEXT: s_add_u32 s3, s4, s3 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_ashr_i32 s2, s1, 31 +; GFX9-NEXT: s_mul_i32 s0, s1, 0x50 +; GFX9-NEXT: s_mul_hi_u32 s1, s1, 0x50 +; GFX9-NEXT: s_mulk_i32 s2, 0x50 +; GFX9-NEXT: s_add_u32 s1, s2, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_mul_u64_sext_with_sregs: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_ashr_i32 s3, s2, 31 -; GFX10-NEXT: s_mul_hi_u32 s4, s2, 0x50 -; GFX10-NEXT: s_mulk_i32 s3, 0x50 -; GFX10-NEXT: s_mulk_i32 s2, 0x50 -; GFX10-NEXT: s_add_i32 s3, s4, s3 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: s_ashr_i32 s1, s0, 31 +; GFX10-NEXT: s_mul_hi_u32 s2, s0, 0x50 +; GFX10-NEXT: s_mulk_i32 s1, 0x50 +; GFX10-NEXT: s_mulk_i32 s0, 0x50 +; GFX10-NEXT: s_add_i32 s1, s2, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_mul_u64_sext_with_sregs: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll index 3bc9a582ebd96..7d7f450e590fa 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -305,25 +305,25 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX9-LABEL: sdivrem_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s2, s9, 31 -; GFX9-NEXT: s_ashr_i32 s12, s11, 31 -; GFX9-NEXT: s_add_u32 s0, s8, s2 -; GFX9-NEXT: s_addc_u32 s1, s9, s2 -; GFX9-NEXT: s_add_u32 s8, s10, s12 -; GFX9-NEXT: s_mov_b32 s13, s12 -; GFX9-NEXT: s_addc_u32 s9, s11, s12 -; GFX9-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s8 +; GFX9-NEXT: s_ashr_i32 s2, s13, 31 +; GFX9-NEXT: s_ashr_i32 s4, s15, 31 +; GFX9-NEXT: s_add_u32 s0, s12, s2 +; GFX9-NEXT: s_addc_u32 s1, s13, s2 +; GFX9-NEXT: s_add_u32 s6, s14, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_addc_u32 s7, s15, s4 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 ; GFX9-NEXT: s_mov_b32 s3, s2 -; GFX9-NEXT: s_xor_b64 s[10:11], s[0:1], s[2:3] +; GFX9-NEXT: s_xor_b64 s[12:13], s[0:1], s[2:3] ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_u32 s14, 0, s8 -; GFX9-NEXT: s_subb_u32 s15, 0, s9 +; GFX9-NEXT: s_sub_u32 s14, 0, s6 +; GFX9-NEXT: s_subb_u32 s15, 0, s7 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v2, v1 @@ -357,7 +357,7 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v3, 0 -; GFX9-NEXT: v_mov_b32_e32 v7, s9 +; GFX9-NEXT: v_mov_b32_e32 v7, s7 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v4, v[1:2] ; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v3, v[1:2] @@ -382,52 +382,52 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, s11, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, s10, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s10, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, s11, v0 -; GFX9-NEXT: v_mul_hi_u32 v6, s11, v1 +; GFX9-NEXT: v_mul_lo_u32 v2, s13, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1 +; GFX9-NEXT: v_mul_hi_u32 v4, s12, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s13, v0 +; GFX9-NEXT: v_mul_hi_u32 v6, s13, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, s11, v1 +; GFX9-NEXT: v_mul_lo_u32 v4, s13, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_mul_hi_u32 v3, s10, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s12, v1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v5, 0 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v5, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v6, s11 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s10, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v5, v[1:2] +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s6, v3, v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v6, s13 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s12, v0 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v5, v[1:2] ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v6, v1, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2 -; GFX9-NEXT: v_sub_u32_e32 v1, s11, v1 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v2 +; GFX9-NEXT: v_sub_u32_e32 v1, s13, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v0 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v2 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v8, vcc, s8, v0 +; GFX9-NEXT: v_subrev_co_u32_e32 v8, vcc, s6, v0 ; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[0:1], 0, v1, vcc ; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v5 ; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v9 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v8 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v8 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v9 -; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s8, v8 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v9 +; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s6, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v10 ; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc @@ -442,7 +442,7 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v6, v0, v6, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[0:1] -; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], s[12:13] +; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5] ; GFX9-NEXT: v_xor_b32_e32 v0, s0, v5 ; GFX9-NEXT: v_xor_b32_e32 v1, s1, v3 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 @@ -453,27 +453,27 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v6, s2 ; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s2, v3 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v6, vcc -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] -; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9] +; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sdivrem_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_ashr_i32 s2, s9, 31 -; GFX10-NEXT: s_ashr_i32 s12, s11, 31 -; GFX10-NEXT: s_add_u32 s0, s8, s2 -; GFX10-NEXT: s_addc_u32 s1, s9, s2 -; GFX10-NEXT: s_add_u32 s8, s10, s12 -; GFX10-NEXT: s_mov_b32 s13, s12 -; GFX10-NEXT: s_addc_u32 s9, s11, s12 +; GFX10-NEXT: s_ashr_i32 s2, s13, 31 +; GFX10-NEXT: s_ashr_i32 s4, s15, 31 +; GFX10-NEXT: s_add_u32 s0, s12, s2 +; GFX10-NEXT: s_addc_u32 s1, s13, s2 +; GFX10-NEXT: s_add_u32 s6, s14, s4 +; GFX10-NEXT: s_mov_b32 s5, s4 +; GFX10-NEXT: s_addc_u32 s7, s15, s4 ; GFX10-NEXT: s_mov_b32 s3, s2 -; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] ; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s8 -; GFX10-NEXT: s_sub_u32 s10, 0, s8 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s6 +; GFX10-NEXT: s_sub_u32 s12, 0, s6 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -484,11 +484,12 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_cvt_u32_f32_e32 v4, v2 ; GFX10-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s11, s10, v3, 0 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s11, s10, v4, v[1:2] -; GFX10-NEXT: s_subb_u32 s11, 0, s9 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s13, s12, v3, 0 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s13, s12, v4, v[1:2] +; GFX10-NEXT: s_subb_u32 s13, 0, s7 ; GFX10-NEXT: v_mul_hi_u32 v6, v4, v0 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s14, s11, v3, v[1:2] +; GFX10-NEXT: s_xor_b64 s[4:5], s[2:3], s[4:5] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s14, s13, v3, v[1:2] ; GFX10-NEXT: v_mul_lo_u32 v2, v4, v0 ; GFX10-NEXT: v_mul_hi_u32 v0, v3, v0 ; GFX10-NEXT: v_mul_lo_u32 v5, v3, v1 @@ -510,28 +511,28 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v3, v0 ; GFX10-NEXT: v_add3_u32 v1, v5, v2, v1 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v1, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s14, s10, v3, 0 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s10, s10, v4, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s14, s12, v3, 0 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s12, s12, v4, v[1:2] ; GFX10-NEXT: v_mul_hi_u32 v6, v4, v0 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s10, s11, v3, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s12, s13, v3, v[1:2] ; GFX10-NEXT: v_mul_lo_u32 v2, v4, v0 ; GFX10-NEXT: v_mul_hi_u32 v0, v3, v0 ; GFX10-NEXT: v_mul_lo_u32 v5, v3, v1 ; GFX10-NEXT: v_mul_lo_u32 v7, v4, v1 ; GFX10-NEXT: v_mul_hi_u32 v8, v3, v1 ; GFX10-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX10-NEXT: v_add_co_u32 v2, s10, v2, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s10 -; GFX10-NEXT: v_add_co_u32 v6, s10, v7, v6 -; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s10 -; GFX10-NEXT: v_add_co_u32 v0, s10, v2, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s10 -; GFX10-NEXT: v_add_co_u32 v2, s10, v6, v8 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s10 +; GFX10-NEXT: v_add_co_u32 v2, s12, v2, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s12 +; GFX10-NEXT: v_add_co_u32 v6, s12, v7, v6 +; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s12 +; GFX10-NEXT: v_add_co_u32 v0, s12, v2, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s12 +; GFX10-NEXT: v_add_co_u32 v2, s12, v6, v8 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s12 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v5, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v5, v7, v6 -; GFX10-NEXT: v_add_co_u32 v0, s10, v2, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s10 +; GFX10-NEXT: v_add_co_u32 v0, s12, v2, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s12 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v3, v0 ; GFX10-NEXT: v_add3_u32 v1, v5, v2, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, s1, v0 @@ -540,71 +541,70 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_mul_hi_u32 v0, s1, v0 ; GFX10-NEXT: v_mul_lo_u32 v3, s0, v1 ; GFX10-NEXT: v_mul_lo_u32 v5, s1, v1 -; GFX10-NEXT: v_add_co_u32 v2, s10, v2, v3 +; GFX10-NEXT: v_add_co_u32 v2, s12, v2, v3 ; GFX10-NEXT: v_mul_hi_u32 v3, s0, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s10 -; GFX10-NEXT: v_add_co_u32 v2, s10, v2, v4 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s10 -; GFX10-NEXT: v_add_co_u32 v0, s10, v5, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s10 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s12 +; GFX10-NEXT: v_add_co_u32 v2, s12, v2, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s12 +; GFX10-NEXT: v_add_co_u32 v0, s12, v5, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s12 ; GFX10-NEXT: v_add_nc_u32_e32 v2, v6, v2 -; GFX10-NEXT: v_add_co_u32 v0, s10, v0, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s10 -; GFX10-NEXT: v_add_co_u32 v5, s10, v0, v2 +; GFX10-NEXT: v_add_co_u32 v0, s12, v0, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s12 +; GFX10-NEXT: v_add_co_u32 v5, s12, v0, v2 ; GFX10-NEXT: v_mul_hi_u32 v2, s1, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s10 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s12 ; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v3 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s10, s8, v5, 0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s12, s6, v5, 0 ; GFX10-NEXT: v_add3_u32 v3, v3, v6, v2 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s10, s8, v3, v[1:2] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s10, s9, v5, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s12, s6, v3, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s12, s7, v5, v[1:2] ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v5, 1 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v6, s1, v1 ; GFX10-NEXT: v_sub_co_ci_u32_e64 v1, s0, s1, v1, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s9, v6, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s8, v0 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s7, v6, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v0, s8 +; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v0, s6 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v9, s0, 0, v6, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s9, v1 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s9, v6, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s7, v1 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s7, v6, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s8, v8 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s6, v8 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s9, v9 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s7, v9 ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, s0 ; GFX10-NEXT: v_add_co_u32 v13, s0, v2, 1 ; GFX10-NEXT: v_add_co_ci_u32_e64 v14, s0, 0, v4, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s9, v9 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s7, v9 ; GFX10-NEXT: v_cndmask_b32_e64 v11, v12, v11, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s9, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s7, v1 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v10, v7, s0 -; GFX10-NEXT: v_sub_co_u32 v10, s0, v8, s8 +; GFX10-NEXT: v_sub_co_u32 v10, s0, v8, s6 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v6, s0, 0, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v13, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v14, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v7, v8, v10, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc_lo -; GFX10-NEXT: s_xor_b64 s[8:9], s[2:3], s[12:13] ; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v4, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v7, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: v_xor_b32_e32 v2, s8, v2 -; GFX10-NEXT: v_xor_b32_e32 v3, s9, v3 +; GFX10-NEXT: v_xor_b32_e32 v2, s4, v2 +; GFX10-NEXT: v_xor_b32_e32 v3, s5, v3 ; GFX10-NEXT: v_xor_b32_e32 v5, s2, v0 ; GFX10-NEXT: v_xor_b32_e32 v6, s2, v1 -; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v2, s8 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s9, v3, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v2, s4 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s5, v3, vcc_lo ; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v5, s2 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s2, v6, vcc_lo -; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] -; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9] +; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11] ; GFX10-NEXT: s_endpgm %div = sdiv i64 %x, %y store i64 %div, ptr addrspace(1) %out0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll index 9ee0acf2aa2db..8ea5fc25d95de 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll @@ -92,8 +92,8 @@ entry: ; GCN-LABEL: {{^}}smrd6: ; SICIVI: s_add_u32 s{{[0-9]}}, s{{[0-9]}}, -4 ; SICIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x0 -; GFX9_10: s_add_u32 s2, s2, -4 -; GFX9_10: s_addc_u32 s3, s3, -1 +; GFX9_10: s_add_u32 s0, s6, -4 +; GFX9_10: s_addc_u32 s1, s7, -1 ; GFX9_10: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x0 define amdgpu_kernel void @smrd6(ptr addrspace(1) %out, ptr addrspace(4) %ptr) #0 { entry: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll index 0389cacb61390..5aef667934709 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -251,12 +251,12 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX9-LABEL: udivrem_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s11 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s10 -; GFX9-NEXT: s_sub_u32 s2, 0, s10 -; GFX9-NEXT: s_subb_u32 s3, 0, s11 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s15 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s14 +; GFX9-NEXT: s_sub_u32 s2, 0, s14 +; GFX9-NEXT: s_subb_u32 s3, 0, s15 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -293,7 +293,7 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX9-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-NEXT: v_mov_b32_e32 v7, s15 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] ; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] @@ -318,52 +318,52 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, s9, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s8, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: v_mul_hi_u32 v6, s9, v1 +; GFX9-NEXT: v_mul_lo_u32 v2, s13, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1 +; GFX9-NEXT: v_mul_hi_u32 v4, s12, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s13, v0 +; GFX9-NEXT: v_mul_hi_u32 v6, s13, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, s9, v1 +; GFX9-NEXT: v_mul_lo_u32 v4, s13, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_mul_hi_u32 v3, s8, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s12, v1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v5, 0 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v5, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v3, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v6, s9 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v3, v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v6, s13 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v5, v[1:2] -; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s8, v0 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v5, v[1:2] +; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s12, v0 ; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v1, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v6 -; GFX9-NEXT: v_sub_u32_e32 v0, s9, v1 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v6 +; GFX9-NEXT: v_sub_u32_e32 v0, s13, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v6 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v6 ; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v7, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v8, vcc, s10, v2 +; GFX9-NEXT: v_subrev_co_u32_e32 v8, vcc, s14, v2 ; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[0:1], 0, v0, vcc ; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v5 ; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v9 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v8 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v8 ; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v7, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v9 -; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s10, v8 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v9 +; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s14, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v10 ; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v0, vcc @@ -378,17 +378,17 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v15, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v5, s[0:1] -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] -; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9] +; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: udivrem_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s11 -; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s10 -; GFX10-NEXT: s_sub_u32 s0, 0, s10 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s15 +; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s14 +; GFX10-NEXT: s_sub_u32 s0, 0, s14 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -401,7 +401,7 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v0 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s1, s0, v3, 0 ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s1, s0, v4, v[1:2] -; GFX10-NEXT: s_subb_u32 s1, 0, s11 +; GFX10-NEXT: s_subb_u32 s1, 0, s15 ; GFX10-NEXT: v_mul_hi_u32 v6, v4, v0 ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s2, s1, v3, v[1:2] ; GFX10-NEXT: v_mul_lo_u32 v2, v4, v0 @@ -449,14 +449,14 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v3, v0 ; GFX10-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX10-NEXT: v_mul_lo_u32 v2, s9, v0 +; GFX10-NEXT: v_mul_lo_u32 v2, s13, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v4, v1, vcc_lo -; GFX10-NEXT: v_mul_hi_u32 v4, s8, v0 -; GFX10-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX10-NEXT: v_mul_lo_u32 v3, s8, v1 -; GFX10-NEXT: v_mul_lo_u32 v5, s9, v1 +; GFX10-NEXT: v_mul_hi_u32 v4, s12, v0 +; GFX10-NEXT: v_mul_hi_u32 v0, s13, v0 +; GFX10-NEXT: v_mul_lo_u32 v3, s12, v1 +; GFX10-NEXT: v_mul_lo_u32 v5, s13, v1 ; GFX10-NEXT: v_add_co_u32 v2, s0, v2, v3 -; GFX10-NEXT: v_mul_hi_u32 v3, s8, v1 +; GFX10-NEXT: v_mul_hi_u32 v3, s12, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v2, s0, v2, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 @@ -466,38 +466,38 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v5, s0, v0, v2 -; GFX10-NEXT: v_mul_hi_u32 v2, s9, v1 +; GFX10-NEXT: v_mul_hi_u32 v2, s13, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 ; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v3 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s10, v5, 0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s14, v5, 0 ; GFX10-NEXT: v_add3_u32 v3, v3, v6, v2 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s10, v3, v[1:2] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s11, v5, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s14, v3, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s15, v5, v[1:2] ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v5, 1 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v3, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v7, vcc_lo, s8, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v6, s9, v1 -; GFX10-NEXT: v_sub_co_ci_u32_e64 v8, s0, s9, v1, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s11, v6, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s10, v7 +; GFX10-NEXT: v_sub_co_u32 v7, vcc_lo, s12, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v6, s13, v1 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v8, s0, s13, v1, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s15, v6, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s14, v7 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v7, s10 +; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v7, s14 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v9, s0, 0, v0, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s11, v8 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s11, v0, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s15, v8 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s15, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v6 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s14, v6 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s11, v9 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s15, v9 ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, s0 ; GFX10-NEXT: v_add_co_u32 v13, s0, v2, 1 ; GFX10-NEXT: v_add_co_ci_u32_e64 v14, s0, 0, v4, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s11, v9 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s15, v9 ; GFX10-NEXT: v_cndmask_b32_e64 v11, v12, v11, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s11, v8 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s15, v8 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v10, v1, s0 -; GFX10-NEXT: v_sub_co_u32 v10, s0, v6, s10 +; GFX10-NEXT: v_sub_co_u32 v10, s0, v6, s14 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v0, s0, 0, v0, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v13, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v14, vcc_lo @@ -509,8 +509,8 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_cndmask_b32_e64 v1, v3, v4, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v8, v9, s0 -; GFX10-NEXT: global_store_dwordx2 v10, v[0:1], s[4:5] -; GFX10-NEXT: global_store_dwordx2 v10, v[2:3], s[6:7] +; GFX10-NEXT: global_store_dwordx2 v10, v[0:1], s[8:9] +; GFX10-NEXT: global_store_dwordx2 v10, v[2:3], s[10:11] ; GFX10-NEXT: s_endpgm %div = udiv i64 %x, %y store i64 %div, ptr addrspace(1) %out0 @@ -1248,16 +1248,16 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: udivrem_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[12:15], s[6:7], 0x20 +; GFX9-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x20 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s13 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s12 -; GFX9-NEXT: s_sub_u32 s2, 0, s12 -; GFX9-NEXT: s_subb_u32 s3, 0, s13 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s17 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s16 +; GFX9-NEXT: s_sub_u32 s2, 0, s16 +; GFX9-NEXT: s_subb_u32 s3, 0, s17 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v2, v1 @@ -1293,12 +1293,12 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] ; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX9-NEXT: s_sub_u32 s2, 0, s14 +; GFX9-NEXT: s_sub_u32 s2, 0, s18 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] ; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 ; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX9-NEXT: s_subb_u32 s3, 0, s15 +; GFX9-NEXT: s_subb_u32 s3, 0, s19 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 @@ -1317,48 +1317,47 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mul_lo_u32 v2, s9, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s8, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: v_mul_hi_u32 v5, s9, v1 +; GFX9-NEXT: v_mul_lo_u32 v2, s13, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1 +; GFX9-NEXT: v_mul_hi_u32 v4, s12, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s13, v0 +; GFX9-NEXT: v_mul_hi_u32 v5, s13, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, s9, v1 +; GFX9-NEXT: v_mul_lo_u32 v4, s13, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_mul_hi_u32 v3, s8, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, s13 +; GFX9-NEXT: v_mul_hi_u32 v3, s12, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, s17 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v8, 0 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v8, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 ; GFX9-NEXT: v_add3_u32 v9, v3, v0, v5 ; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s12, v9, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v9, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v5, s13 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s13, v8, v[2:3] -; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s8, v1 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s17, v8, v[2:3] +; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s12, v1 ; GFX9-NEXT: v_subb_co_u32_e64 v1, s[0:1], v5, v3, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v1 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s17, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v2 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s16, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v1 -; GFX9-NEXT: v_sub_u32_e32 v3, s9, v3 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s17, v1 +; GFX9-NEXT: v_sub_u32_e32 v3, s13, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v10, v4, v5, s[0:1] -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s15 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s19 ; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v3, v6, vcc -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s14 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s18 ; GFX9-NEXT: v_mul_f32_e32 v4, 0x4f800000, v4 -; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s12, v2 +; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s16, v2 ; GFX9-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v7, vcc @@ -1370,13 +1369,13 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v16, v3 ; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v9, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v12 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s17, v12 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v11 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s16, v11 ; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1] ; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v16, 0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v15 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v12 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s17, v12 ; GFX9-NEXT: v_cndmask_b32_e64 v17, v5, v17, s[0:1] ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v15, v[4:5] ; GFX9-NEXT: v_add_co_u32_e64 v18, s[0:1], 1, v13 @@ -1385,7 +1384,7 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v6, vcc ; GFX9-NEXT: v_mul_lo_u32 v6, v15, v3 ; GFX9-NEXT: v_mul_lo_u32 v7, v16, v4 -; GFX9-NEXT: v_subrev_co_u32_e32 v20, vcc, s12, v11 +; GFX9-NEXT: v_subrev_co_u32_e32 v20, vcc, s16, v11 ; GFX9-NEXT: v_subbrev_co_u32_e32 v21, vcc, 0, v5, vcc ; GFX9-NEXT: v_mul_hi_u32 v5, v16, v3 ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v7 @@ -1441,55 +1440,55 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add3_u32 v6, v9, v8, v6 ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v16, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v15, v6, vcc -; GFX9-NEXT: v_mul_lo_u32 v8, s11, v5 -; GFX9-NEXT: v_mul_lo_u32 v9, s10, v6 +; GFX9-NEXT: v_mul_lo_u32 v8, s15, v5 +; GFX9-NEXT: v_mul_lo_u32 v9, s14, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v7, v2, v7, s[0:1] -; GFX9-NEXT: v_mul_hi_u32 v2, s10, v5 -; GFX9-NEXT: v_mul_hi_u32 v5, s11, v5 +; GFX9-NEXT: v_mul_hi_u32 v2, s14, v5 +; GFX9-NEXT: v_mul_hi_u32 v5, s15, v5 ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v8, s11, v6 +; GFX9-NEXT: v_mul_lo_u32 v8, s15, v6 ; GFX9-NEXT: v_add_u32_e32 v2, v9, v2 -; GFX9-NEXT: v_mul_hi_u32 v9, s10, v6 -; GFX9-NEXT: v_mul_hi_u32 v13, s11, v6 +; GFX9-NEXT: v_mul_hi_u32 v9, s14, v6 +; GFX9-NEXT: v_mul_hi_u32 v13, s15, v6 ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v8, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v5, v2 -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[2:3], s14, v12, 0 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[2:3], s18, v12, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v8, v1, v10, s[0:1] ; GFX9-NEXT: v_add_u32_e32 v1, v11, v9 ; GFX9-NEXT: v_add3_u32 v9, v1, v2, v13 ; GFX9-NEXT: v_mov_b32_e32 v1, v6 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v9, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v10, s11 -; GFX9-NEXT: v_mov_b32_e32 v6, s15 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v12, v[1:2] -; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s10, v5 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s18, v9, v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v10, s15 +; GFX9-NEXT: v_mov_b32_e32 v6, s19 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s19, v12, v[1:2] +; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s14, v5 ; GFX9-NEXT: v_subb_co_u32_e64 v10, s[0:1], v10, v1, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v10 -; GFX9-NEXT: v_sub_u32_e32 v1, s11, v1 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v10 +; GFX9-NEXT: v_sub_u32_e32 v1, s15, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v2 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v10 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v10 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s14, v2 +; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s18, v2 ; GFX9-NEXT: v_subbrev_co_u32_e64 v13, s[0:1], 0, v1, vcc ; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v12 ; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v9, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v13 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v13 ; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v11 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v11 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v13 -; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s14, v11 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v13 +; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s18, v11 ; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v17, s[0:1], 1, v14 ; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc @@ -1504,22 +1503,24 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v1, s[0:1] -; GFX9-NEXT: global_store_dwordx4 v0, v[3:6], s[4:5] -; GFX9-NEXT: global_store_dwordx4 v0, v[7:10], s[6:7] +; GFX9-NEXT: global_store_dwordx4 v0, v[3:6], s[8:9] +; GFX9-NEXT: global_store_dwordx4 v0, v[7:10], s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: udivrem_v2i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[12:15], s[6:7], 0x20 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x20 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s13 -; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s15 -; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s12 -; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s14 -; GFX10-NEXT: s_sub_u32 s0, 0, s12 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s17 +; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s19 +; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s16 +; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s18 +; GFX10-NEXT: s_sub_u32 s0, 0, s16 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 -; GFX10-NEXT: s_subb_u32 s1, 0, s13 +; GFX10-NEXT: s_subb_u32 s1, 0, s17 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -1539,13 +1540,13 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v7, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v8, v1 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, s0, v7, 0 -; GFX10-NEXT: s_sub_u32 s2, 0, s14 +; GFX10-NEXT: s_sub_u32 s2, 0, s18 ; GFX10-NEXT: v_mad_u64_u32 v[2:3], s3, s2, v8, 0 ; GFX10-NEXT: v_mul_hi_u32 v11, v9, v0 ; GFX10-NEXT: v_mad_u64_u32 v[4:5], s3, s0, v9, v[1:2] ; GFX10-NEXT: v_mad_u64_u32 v[5:6], s3, s2, v10, v[3:4] ; GFX10-NEXT: v_mul_lo_u32 v6, v9, v0 -; GFX10-NEXT: s_subb_u32 s3, 0, s15 +; GFX10-NEXT: s_subb_u32 s3, 0, s19 ; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, s1, v7, v[4:5] ; GFX10-NEXT: v_mul_hi_u32 v4, v7, v0 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, s3, v8, v[5:6] @@ -1592,7 +1593,6 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, s0, v7, 0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v10, v2, vcc_lo ; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, s2, v8, 0 -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 ; GFX10-NEXT: v_mul_hi_u32 v11, v9, v0 ; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, s0, v9, v[1:2] ; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, s2, v10, v[3:4] @@ -1641,21 +1641,20 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v9, v3, vcc_lo ; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v8, v1 ; GFX10-NEXT: v_add_co_ci_u32_e32 v0, vcc_lo, v10, v0, vcc_lo -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mul_lo_u32 v3, s9, v4 -; GFX10-NEXT: v_mul_lo_u32 v8, s8, v2 -; GFX10-NEXT: v_mul_hi_u32 v5, s8, v4 -; GFX10-NEXT: v_mul_hi_u32 v4, s9, v4 -; GFX10-NEXT: v_mul_lo_u32 v9, s9, v2 -; GFX10-NEXT: v_mul_lo_u32 v6, s11, v1 -; GFX10-NEXT: v_mul_hi_u32 v10, s8, v2 -; GFX10-NEXT: v_mul_hi_u32 v11, s9, v2 -; GFX10-NEXT: v_mul_lo_u32 v2, s10, v0 -; GFX10-NEXT: v_mul_hi_u32 v7, s10, v1 -; GFX10-NEXT: v_mul_hi_u32 v1, s11, v1 -; GFX10-NEXT: v_mul_lo_u32 v12, s11, v0 -; GFX10-NEXT: v_mul_hi_u32 v13, s10, v0 -; GFX10-NEXT: v_mul_hi_u32 v14, s11, v0 +; GFX10-NEXT: v_mul_lo_u32 v3, s13, v4 +; GFX10-NEXT: v_mul_lo_u32 v8, s12, v2 +; GFX10-NEXT: v_mul_hi_u32 v5, s12, v4 +; GFX10-NEXT: v_mul_hi_u32 v4, s13, v4 +; GFX10-NEXT: v_mul_lo_u32 v9, s13, v2 +; GFX10-NEXT: v_mul_lo_u32 v6, s15, v1 +; GFX10-NEXT: v_mul_hi_u32 v10, s12, v2 +; GFX10-NEXT: v_mul_hi_u32 v11, s13, v2 +; GFX10-NEXT: v_mul_lo_u32 v2, s14, v0 +; GFX10-NEXT: v_mul_hi_u32 v7, s14, v1 +; GFX10-NEXT: v_mul_hi_u32 v1, s15, v1 +; GFX10-NEXT: v_mul_lo_u32 v12, s15, v0 +; GFX10-NEXT: v_mul_hi_u32 v13, s14, v0 +; GFX10-NEXT: v_mul_hi_u32 v14, s15, v0 ; GFX10-NEXT: v_add_co_u32 v0, s0, v3, v8 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v4, s0, v9, v4 @@ -1678,77 +1677,77 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_add_co_u32 v8, s0, v4, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v10, s0, v1, v2 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s1, s12, v8, 0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s1, s16, v8, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, s14, v10, 0 +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, s18, v10, 0 ; GFX10-NEXT: v_add_nc_u32_e32 v7, v9, v7 ; GFX10-NEXT: v_add3_u32 v9, v5, v4, v11 ; GFX10-NEXT: v_add_co_u32 v12, vcc_lo, v8, 1 ; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; GFX10-NEXT: v_add3_u32 v7, v7, v6, v14 -; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, s12, v9, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, s16, v9, v[1:2] ; GFX10-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v9, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, s14, v7, v[3:4] -; GFX10-NEXT: v_mad_u64_u32 v[3:4], s0, s13, v8, v[4:5] +; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, s18, v7, v[3:4] +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s0, s17, v8, v[4:5] ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v12, 1 ; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v13, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v14, vcc_lo, s8, v0 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s15, v10, v[5:6] -; GFX10-NEXT: v_sub_co_ci_u32_e64 v5, s0, s9, v3, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s12, v14 -; GFX10-NEXT: v_sub_nc_u32_e32 v1, s9, v3 +; GFX10-NEXT: v_sub_co_u32 v14, vcc_lo, s12, v0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s19, v10, v[5:6] +; GFX10-NEXT: v_sub_co_ci_u32_e64 v5, s0, s13, v3, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s16, v14 +; GFX10-NEXT: v_sub_nc_u32_e32 v1, s13, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, -1, s0 -; GFX10-NEXT: v_sub_co_u32 v15, s0, s10, v2 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s13, v1, vcc_lo -; GFX10-NEXT: v_sub_co_ci_u32_e64 v16, s1, s11, v0, s0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s14, v15 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, s11, v0 +; GFX10-NEXT: v_sub_co_u32 v15, s0, s14, v2 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s17, v1, vcc_lo +; GFX10-NEXT: v_sub_co_ci_u32_e64 v16, s1, s15, v0, s0 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s18, v15 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, s15, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v17, vcc_lo, v14, s12 +; GFX10-NEXT: v_sub_co_u32 v17, vcc_lo, v14, s16 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v18, s1, 0, v1, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s13, v5 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v23, s0, s15, v0, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s13, v18 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s13, v1, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s17, v5 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v23, s0, s19, v0, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s17, v18 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s17, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v19, 0, -1, s1 -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s12, v17 +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s16, v17 ; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, -1, s1 -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s13, v18 +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s17, v18 ; GFX10-NEXT: v_cndmask_b32_e64 v21, 0, -1, s1 -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s15, v16 +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s19, v16 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v21, v20, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s13, v5 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s17, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, -1, s1 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v19, v3, s0 -; GFX10-NEXT: v_sub_co_u32 v0, s0, v17, s12 +; GFX10-NEXT: v_sub_co_u32 v0, s0, v17, s16 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v19, s0, 0, v1, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v12, v4, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v13, v6, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v6, s1, v15, s14 +; GFX10-NEXT: v_sub_co_u32 v6, s1, v15, s18 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v17, v0, vcc_lo ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v12, s2, 0, v23, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, v1, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v9, v3, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v18, v19, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s15, v16 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s19, v16 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v14, v4, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v3, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v22, v2, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s15, v12 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s19, v12 ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s14, v6 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s18, v6 ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc_lo ; GFX10-NEXT: v_add_co_u32 v13, vcc_lo, v10, 1 ; GFX10-NEXT: v_add_co_ci_u32_e32 v14, vcc_lo, 0, v7, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s15, v12 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s19, v12 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo ; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v13, 1 ; GFX10-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, 0, v14, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v18, vcc_lo, s15, v23, s1 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v18, vcc_lo, s19, v23, s1 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 -; GFX10-NEXT: v_sub_co_u32 v8, s1, v6, s14 +; GFX10-NEXT: v_sub_co_u32 v8, s1, v6, s18 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v18, s1, 0, v18, s1 ; GFX10-NEXT: v_cndmask_b32_e32 v9, v13, v9, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v13, v14, v17, vcc_lo @@ -1759,8 +1758,8 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v13, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v15, v6, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v16, v8, s1 -; GFX10-NEXT: global_store_dwordx4 v11, v[0:3], s[4:5] -; GFX10-NEXT: global_store_dwordx4 v11, v[4:7], s[6:7] +; GFX10-NEXT: global_store_dwordx4 v11, v[0:3], s[8:9] +; GFX10-NEXT: global_store_dwordx4 v11, v[4:7], s[10:11] ; GFX10-NEXT: s_endpgm %div = udiv <2 x i64> %x, %y store <2 x i64> %div, ptr addrspace(1) %out0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll index c9a9eb9d91724..1f1c2659e8110 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll @@ -479,33 +479,33 @@ return: define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) { ; GFX906-LABEL: v8i8_phi_chain: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 -; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[0:1] ; GFX906-NEXT: s_xor_b64 s[0:1], vcc, -1 -; GFX906-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GFX906-NEXT: s_waitcnt lgkmcnt(0) +; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[4:5] +; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX906-NEXT: s_cbranch_execz .LBB8_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[2:3] +; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[6:7] ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 ; GFX906-NEXT: s_andn2_b64 s[0:1], s[0:1], exec -; GFX906-NEXT: s_and_b64 s[2:3], exec, vcc -; GFX906-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX906-NEXT: s_and_b64 s[4:5], exec, vcc +; GFX906-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX906-NEXT: .LBB8_2: ; %Flow -; GFX906-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX906-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX906-NEXT: s_cbranch_execz .LBB8_4 ; GFX906-NEXT: ; %bb.3: ; %bb.2 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[4:5] +; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[8:9] ; GFX906-NEXT: .LBB8_4: ; %bb.3 ; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[6:7] +; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[10:11] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -533,31 +533,31 @@ bb.3: define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) { ; GFX906-LABEL: v8i8_multi_block: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx2 v[3:4], v5, s[0:1] +; GFX906-NEXT: global_load_dwordx2 v[3:4], v5, s[4:5] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v1, v3 ; GFX906-NEXT: v_mov_b32_e32 v2, v4 ; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB9_4 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dwordx2 v[1:2], v5, s[2:3] +; GFX906-NEXT: global_load_dwordx2 v[1:2], v5, s[6:7] ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 ; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX906-NEXT: s_cbranch_execz .LBB9_3 ; GFX906-NEXT: ; %bb.2: ; %bb.2 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 -; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[4:5] +; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[8:9] ; GFX906-NEXT: .LBB9_3: ; %Flow ; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX906-NEXT: .LBB9_4: ; %bb.3 ; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[6:7] +; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[10:11] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll index ef2e57eafbf13..f36dcb487e915 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll @@ -18,24 +18,24 @@ define amdgpu_kernel void @constant_load_i8_align4(ptr addrspace (1) %out, ptr a ; ; GFX9-LABEL: constant_load_i8_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: global_store_byte v1, v0, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: global_store_byte v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: constant_load_i8_align4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: global_store_byte v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm %ld = load i8, ptr addrspace(4) %in, align 4 store i8 %ld, ptr addrspace(1) %out, align 4 @@ -57,24 +57,24 @@ define amdgpu_kernel void @constant_load_i16_align4(ptr addrspace (1) %out, ptr ; ; GFX9-LABEL: constant_load_i16_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: global_store_short v1, v0, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: global_store_short v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: constant_load_i16_align4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: global_store_short v1, v0, s[0:1] +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: global_store_short v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm %ld = load i16, ptr addrspace(4) %in, align 4 store i16 %ld, ptr addrspace(1) %out, align 4 @@ -97,26 +97,26 @@ define amdgpu_kernel void @sextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: sextload_i8_to_i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i8 s2, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_sext_i32_i8 s0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sextload_i8_to_i32_align4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sext_i32_i8 s2, s2 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: s_sext_i32_i8 s0, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm %load = load i8, ptr addrspace(1) %in, align 4 %sext = sext i8 %load to i32 @@ -140,26 +140,26 @@ define amdgpu_kernel void @sextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: sextload_i16_to_i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s2, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_sext_i32_i16 s0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sextload_i16_to_i32_align4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sext_i32_i16 s2, s2 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: s_sext_i32_i16 s0, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm %load = load i16, ptr addrspace(1) %in, align 4 %sext = sext i16 %load to i32 @@ -183,26 +183,26 @@ define amdgpu_kernel void @zextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: zextload_i8_to_i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, s2, 0xff -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_and_b32 s0, s0, 0xff +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: zextload_i8_to_i32_align4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_and_b32 s2, s2, 0xff -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: s_and_b32 s0, s0, 0xff +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm %load = load i8, ptr addrspace(1) %in, align 4 %zext = zext i8 %load to i32 @@ -226,26 +226,26 @@ define amdgpu_kernel void @zextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: zextload_i16_to_i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, s2, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: zextload_i16_to_i32_align4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_and_b32 s2, s2, 0xffff -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm %load = load i16, ptr addrspace(1) %in, align 4 %zext = zext i16 %load to i32 @@ -269,22 +269,22 @@ define amdgpu_kernel void @constant_load_i8_align2(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: constant_load_i8_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-NEXT: global_load_ubyte v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NEXT: global_store_byte v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: constant_load_i8_align2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_byte v0, v1, s[0:1] +; GFX10-NEXT: global_store_byte v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm %load = load i8, ptr addrspace(1) %in, align 2 store i8 %load, ptr addrspace(1) %out, align 2 @@ -307,22 +307,22 @@ define amdgpu_kernel void @constant_load_i16_align2(ptr addrspace(1) %out, ptr a ; ; GFX9-LABEL: constant_load_i16_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: constant_load_i16_align2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm %load = load i16, ptr addrspace(1) %in, align 2 store i16 %load, ptr addrspace(1) %out, align 2 @@ -351,24 +351,24 @@ define amdgpu_kernel void @constant_sextload_i8_align2(ptr addrspace(1) %out, pt ; ; GFX9-LABEL: constant_sextload_i8_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_sbyte v1, v0, s[2:3] +; GFX9-NEXT: global_load_sbyte v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_short v0, v1, s[0:1] -; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:2 +; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[4:5] offset:2 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: constant_sextload_i8_align2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_sbyte v1, v0, s[2:3] +; GFX10-NEXT: global_load_sbyte v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_short v0, v1, s[0:1] -; GFX10-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:2 +; GFX10-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-NEXT: global_store_short_d16_hi v0, v1, s[4:5] offset:2 ; GFX10-NEXT: s_endpgm %load = load i8, ptr addrspace(1) %in, align 2 %sextload = sext i8 %load to i32 @@ -398,24 +398,24 @@ define amdgpu_kernel void @constant_zextload_i8_align2(ptr addrspace(1) %out, pt ; ; GFX9-LABEL: constant_zextload_i8_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-NEXT: global_load_ubyte v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_short v0, v1, s[0:1] -; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:2 +; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[4:5] offset:2 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: constant_zextload_i8_align2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_short v0, v1, s[0:1] -; GFX10-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:2 +; GFX10-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-NEXT: global_store_short_d16_hi v0, v1, s[4:5] offset:2 ; GFX10-NEXT: s_endpgm %load = load i8, ptr addrspace(1) %in, align 2 %zextload = zext i8 %load to i32 diff --git a/llvm/test/CodeGen/AMDGPU/add.ll b/llvm/test/CodeGen/AMDGPU/add.ll index e9797fa1fc309..94d704fa3f92d 100644 --- a/llvm/test/CodeGen/AMDGPU/add.ll +++ b/llvm/test/CodeGen/AMDGPU/add.ll @@ -35,26 +35,26 @@ define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX9-LABEL: s_add_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_i32 s2, s2, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_add_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_add_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_i32 s2, s2, s3 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_add_i32 s0, s0, s1 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_add_i32: @@ -125,30 +125,30 @@ define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: s_add_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_i32 s2, s5, s7 -; GFX9-NEXT: s_add_i32 s3, s4, s6 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_add_i32 s1, s1, s3 +; GFX9-NEXT: s_add_i32 s0, s0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_add_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_i32 s2, s4, s6 -; GFX10-NEXT: s_add_i32 s3, s5, s7 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: s_add_i32 s0, s0, s2 +; GFX10-NEXT: s_add_i32 s1, s1, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_add_v2i32: @@ -813,30 +813,30 @@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX9-LABEL: v_add_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_add_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_add_i32: @@ -922,26 +922,26 @@ define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: v_add_imm_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, 0x7b, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_add_imm_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x7b, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_add_imm_i32: @@ -1240,50 +1240,50 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: add64_in_branch: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 ; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 ; GFX9-NEXT: ; %bb.1: ; %else -; GFX9-NEXT: s_add_u32 s4, s4, s6 -; GFX9-NEXT: s_addc_u32 s5, s5, s7 -; GFX9-NEXT: s_andn2_b64 vcc, exec, s[8:9] +; GFX9-NEXT: s_add_u32 s0, s8, s10 +; GFX9-NEXT: s_addc_u32 s1, s9, s11 +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GFX9-NEXT: s_cbranch_vccnz .LBB9_3 ; GFX9-NEXT: .LBB9_2: ; %if -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: .LBB9_3: ; %endif ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; GFX9-NEXT: .LBB9_4: -; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-NEXT: ; implicit-def: $sgpr0_sgpr1 ; GFX9-NEXT: s_branch .LBB9_2 ; ; GFX10-LABEL: add64_in_branch: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX10-NEXT: s_cmp_lg_u64 s[8:9], 0 ; GFX10-NEXT: s_cbranch_scc0 .LBB9_4 ; GFX10-NEXT: ; %bb.1: ; %else -; GFX10-NEXT: s_add_u32 s4, s4, s6 -; GFX10-NEXT: s_addc_u32 s5, s5, s7 +; GFX10-NEXT: s_add_u32 s0, s8, s10 +; GFX10-NEXT: s_addc_u32 s1, s9, s11 ; GFX10-NEXT: s_cbranch_execnz .LBB9_3 ; GFX10-NEXT: .LBB9_2: ; %if -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: .LBB9_3: ; %endif ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; GFX10-NEXT: .LBB9_4: -; GFX10-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX10-NEXT: ; implicit-def: $sgpr0_sgpr1 ; GFX10-NEXT: s_branch .LBB9_2 ; ; GFX11-LABEL: add64_in_branch: diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll index 3720b9da52dcd..4cc384e9d2718 100644 --- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll @@ -186,24 +186,24 @@ define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: s_test_add_self_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_add_u16 v1, s2, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_pk_add_u16 v1, s0, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_test_add_self_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_add_u16 v1, s2, s2 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: v_pk_add_u16 v1, s0, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_add_self_v2i16: @@ -300,27 +300,27 @@ define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_test_add_v2i16_constant: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_mov_b32 s0, 0x1c8007b ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s2, 0x1c8007b -; GFX9-NEXT: v_pk_add_u16 v0, v0, s2 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: v_pk_add_u16 v0, v0, s0 +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_add_v2i16_constant: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v0, 0x1c8007b, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_add_v2i16_constant: @@ -369,27 +369,27 @@ define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_add_v2i16_neg_constant: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_mov_b32 s0, 0xfc21fcb3 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s2, 0xfc21fcb3 -; GFX9-NEXT: v_pk_add_u16 v0, v0, s2 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: v_pk_add_u16 v0, v0, s0 +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_add_v2i16_neg_constant: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v0, 0xfc21fcb3, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_add_v2i16_neg_constant: @@ -437,26 +437,26 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, p ; ; GFX9-LABEL: v_test_add_v2i16_inline_neg1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_u16 v0, v0, -1 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_add_v2i16_inline_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v0, v0, -1 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_add_v2i16_inline_neg1: @@ -503,26 +503,26 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_add_v2i16_inline_lo_zero_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_u16 v0, v0, 32 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_add_v2i16_inline_lo_zero_hi: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v0, v0, 32 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_add_v2i16_inline_lo_zero_hi: @@ -570,26 +570,26 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_test_add_v2i16_inline_fp_split: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_u16 v0, v0, 1.0 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_add_v2i16_inline_fp_split: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v0, v0, 1.0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_add_v2i16_inline_fp_split: diff --git a/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll b/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll index def6df9adf597..88203202a320d 100644 --- a/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll +++ b/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll @@ -46,11 +46,11 @@ define amdgpu_kernel void @test2(ptr %p, i32 %x) { ; GFX9-NEXT: s_cmp_lt_i32 s0, 1 ; GFX9-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX9-NEXT: ; %bb.1: ; %else -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_store_dword v[0:1], v2 ; GFX9-NEXT: s_endpgm ; GFX9-NEXT: .LBB2_2: ; %then @@ -63,11 +63,11 @@ define amdgpu_kernel void @test2(ptr %p, i32 %x) { ; GFX10-NEXT: s_cmp_lt_i32 s0, 1 ; GFX10-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX10-NEXT: ; %bb.1: ; %else -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: flat_store_dword v[0:1], v2 ; GFX10-NEXT: s_endpgm ; GFX10-NEXT: .LBB2_2: ; %then diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll index 77976e470fc78..95f59479c73e8 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll @@ -72,12 +72,12 @@ define amdgpu_ps void @test_sgpr_plus_imm_offset(ptr addrspace(4) inreg %base, i ; SDAG-DAG: %[[BASE1:.*]]:sgpr_32 = COPY $sgpr1 ; SDAG-DAG: %[[OFFSET:.*]]:sgpr_32 = COPY $sgpr2 ; SDAG-DAG: %[[BASE:.*]]:sgpr_64 = REG_SEQUENCE %[[BASE0]], %subreg.sub0, %[[BASE1]], %subreg.sub1 -; SDAG: S_LOAD_DWORDX2_SGPR_IMM killed %[[BASE]], %[[OFFSET]], 16, +; SDAG: S_LOAD_DWORDX2_SGPR_IMM_ec killed %[[BASE]], %[[OFFSET]], 16, ; GISEL-DAG: %[[BASE0:.*]]:sreg_32 = COPY $sgpr0 ; GISEL-DAG: %[[BASE1:.*]]:sreg_32 = COPY $sgpr1 ; GISEL-DAG: %[[OFFSET:.*]]:sreg_32 = COPY $sgpr2 ; GISEL-DAG: %[[BASE:.*]]:sreg_64 = REG_SEQUENCE %[[BASE0]], %subreg.sub0, %[[BASE1]], %subreg.sub1 -; GISEL: S_LOAD_DWORDX2_SGPR_IMM %[[BASE]], %[[OFFSET]], 16, +; GISEL: S_LOAD_DWORDX2_SGPR_IMM_ec %[[BASE]], %[[OFFSET]], 16, define amdgpu_ps void @test_sgpr_plus_imm_offset_x2(ptr addrspace(4) inreg %base, i32 inreg %offset, ptr addrspace(1) inreg %out) { %v1 = getelementptr i8, ptr addrspace(4) %base, i64 16 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index cf04dc8e59ead..96e92bb3dce0d 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -7290,13 +7290,13 @@ define amdgpu_kernel void @udiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX9-LABEL: udiv_i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 12 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], 12 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %r = udiv i64 %x, 4096 store i64 %r, ptr addrspace(1) %out @@ -7871,12 +7871,12 @@ define amdgpu_kernel void @urem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX9-LABEL: urem_i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, s2, 0xfff -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX9-NEXT: s_and_b32 s0, s6, 0xfff +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %r = urem i64 %x, 4096 store i64 %r, ptr addrspace(1) %out @@ -8138,58 +8138,58 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX9-LABEL: sdiv_i64_oddk_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s4, 0x33fe64 -; GFX9-NEXT: s_add_u32 s4, 0x396, s4 +; GFX9-NEXT: s_mov_b32 s0, 0x33fe64 +; GFX9-NEXT: s_add_u32 s0, 0x396, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x28100000 -; GFX9-NEXT: s_addc_u32 s5, 0, 0 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0 +; GFX9-NEXT: s_addc_u32 s1, 0, 0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_addc_u32 s4, s5, 0xd95 -; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: s_mul_i32 s5, s4, 0xffed2705 -; GFX9-NEXT: s_mul_hi_u32 s7, s6, 0xffed2705 -; GFX9-NEXT: s_add_i32 s7, s7, s5 -; GFX9-NEXT: s_sub_i32 s5, s7, s6 -; GFX9-NEXT: s_mul_i32 s8, s6, 0xffed2705 -; GFX9-NEXT: s_mul_hi_u32 s11, s6, s5 -; GFX9-NEXT: s_mul_i32 s12, s6, s5 -; GFX9-NEXT: s_mul_hi_u32 s6, s6, s8 -; GFX9-NEXT: s_add_u32 s6, s6, s12 -; GFX9-NEXT: s_mul_hi_u32 s9, s4, s8 -; GFX9-NEXT: s_mul_i32 s10, s4, s8 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_addc_u32 s0, s1, 0xd95 +; GFX9-NEXT: v_readfirstlane_b32 s2, v0 +; GFX9-NEXT: s_mul_i32 s1, s0, 0xffed2705 +; GFX9-NEXT: s_mul_hi_u32 s3, s2, 0xffed2705 +; GFX9-NEXT: s_add_i32 s3, s3, s1 +; GFX9-NEXT: s_sub_i32 s1, s3, s2 +; GFX9-NEXT: s_mul_i32 s8, s2, 0xffed2705 +; GFX9-NEXT: s_mul_hi_u32 s11, s2, s1 +; GFX9-NEXT: s_mul_i32 s12, s2, s1 +; GFX9-NEXT: s_mul_hi_u32 s2, s2, s8 +; GFX9-NEXT: s_add_u32 s2, s2, s12 +; GFX9-NEXT: s_mul_hi_u32 s9, s0, s8 +; GFX9-NEXT: s_mul_i32 s10, s0, s8 ; GFX9-NEXT: s_addc_u32 s8, 0, s11 -; GFX9-NEXT: s_add_u32 s6, s6, s10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: s_mul_hi_u32 s7, s4, s5 -; GFX9-NEXT: s_addc_u32 s6, s8, s9 -; GFX9-NEXT: s_addc_u32 s7, s7, 0 -; GFX9-NEXT: s_mul_i32 s5, s4, s5 -; GFX9-NEXT: s_add_u32 s5, s6, s5 -; GFX9-NEXT: s_addc_u32 s6, 0, s7 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s5, v0 +; GFX9-NEXT: s_add_u32 s2, s2, s10 +; GFX9-NEXT: s_mul_hi_u32 s3, s0, s1 +; GFX9-NEXT: s_addc_u32 s2, s8, s9 +; GFX9-NEXT: s_addc_u32 s3, s3, 0 +; GFX9-NEXT: s_mul_i32 s1, s0, s1 +; GFX9-NEXT: s_add_u32 s1, s2, s1 +; GFX9-NEXT: s_addc_u32 s2, 0, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s1, v0 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_addc_u32 s6, s4, s6 +; GFX9-NEXT: s_addc_u32 s8, s0, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s4, s3, 31 -; GFX9-NEXT: s_add_u32 s2, s2, s4 -; GFX9-NEXT: s_mov_b32 s5, s4 -; GFX9-NEXT: s_addc_u32 s3, s3, s4 -; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] +; GFX9-NEXT: s_ashr_i32 s0, s7, 31 +; GFX9-NEXT: s_add_u32 s2, s6, s0 +; GFX9-NEXT: s_mov_b32 s1, s0 +; GFX9-NEXT: s_addc_u32 s3, s7, s0 +; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1] ; GFX9-NEXT: v_readfirstlane_b32 s9, v0 -; GFX9-NEXT: s_mul_i32 s8, s2, s6 +; GFX9-NEXT: s_mul_i32 s7, s2, s8 ; GFX9-NEXT: s_mul_hi_u32 s10, s2, s9 -; GFX9-NEXT: s_mul_hi_u32 s7, s2, s6 -; GFX9-NEXT: s_add_u32 s8, s10, s8 -; GFX9-NEXT: s_addc_u32 s7, 0, s7 +; GFX9-NEXT: s_mul_hi_u32 s6, s2, s8 +; GFX9-NEXT: s_add_u32 s7, s10, s7 +; GFX9-NEXT: s_addc_u32 s6, 0, s6 ; GFX9-NEXT: s_mul_hi_u32 s11, s3, s9 ; GFX9-NEXT: s_mul_i32 s9, s3, s9 -; GFX9-NEXT: s_add_u32 s8, s8, s9 -; GFX9-NEXT: s_mul_hi_u32 s10, s3, s6 -; GFX9-NEXT: s_addc_u32 s7, s7, s11 -; GFX9-NEXT: s_addc_u32 s8, s10, 0 -; GFX9-NEXT: s_mul_i32 s6, s3, s6 -; GFX9-NEXT: s_add_u32 s6, s7, s6 -; GFX9-NEXT: s_addc_u32 s7, 0, s8 +; GFX9-NEXT: s_add_u32 s7, s7, s9 +; GFX9-NEXT: s_mul_hi_u32 s10, s3, s8 +; GFX9-NEXT: s_addc_u32 s6, s6, s11 +; GFX9-NEXT: s_addc_u32 s7, s10, 0 +; GFX9-NEXT: s_mul_i32 s8, s3, s8 +; GFX9-NEXT: s_add_u32 s6, s6, s8 +; GFX9-NEXT: s_addc_u32 s7, 0, s7 ; GFX9-NEXT: s_add_u32 s8, s6, 1 ; GFX9-NEXT: s_addc_u32 s9, s7, 0 ; GFX9-NEXT: s_add_u32 s10, s6, 2 @@ -8222,13 +8222,13 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; GFX9-NEXT: s_cmp_lg_u32 s2, 0 ; GFX9-NEXT: s_cselect_b32 s3, s3, s7 ; GFX9-NEXT: s_cselect_b32 s2, s8, s6 -; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] -; GFX9-NEXT: s_sub_u32 s2, s2, s4 -; GFX9-NEXT: s_subb_u32 s3, s3, s4 +; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1] +; GFX9-NEXT: s_sub_u32 s2, s2, s0 +; GFX9-NEXT: s_subb_u32 s3, s3, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %r = sdiv i64 %x, 1235195 store i64 %r, ptr addrspace(1) %out @@ -8261,17 +8261,17 @@ define amdgpu_kernel void @sdiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX9-LABEL: sdiv_i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s4, s3, 31 -; GFX9-NEXT: s_lshr_b32 s4, s4, 20 -; GFX9-NEXT: s_add_u32 s2, s2, s4 -; GFX9-NEXT: s_addc_u32 s3, s3, 0 -; GFX9-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_ashr_i32 s0, s7, 31 +; GFX9-NEXT: s_lshr_b32 s0, s0, 20 +; GFX9-NEXT: s_add_u32 s0, s6, s0 +; GFX9-NEXT: s_addc_u32 s1, s7, 0 +; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %r = sdiv i64 %x, 4096 store i64 %r, ptr addrspace(1) %out @@ -9132,19 +9132,19 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s8 ; GFX9-NEXT: s_lshl_b64 s[10:11], 0x1000, s10 -; GFX9-NEXT: s_ashr_i32 s8, s1, 31 -; GFX9-NEXT: s_add_u32 s0, s0, s8 -; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: s_addc_u32 s1, s1, s8 -; GFX9-NEXT: s_xor_b64 s[12:13], s[0:1], s[8:9] -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s12 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13 -; GFX9-NEXT: s_sub_u32 s0, 0, s12 -; GFX9-NEXT: s_subb_u32 s1, 0, s13 +; GFX9-NEXT: s_ashr_i32 s12, s1, 31 +; GFX9-NEXT: s_add_u32 s0, s0, s12 +; GFX9-NEXT: s_mov_b32 s13, s12 +; GFX9-NEXT: s_addc_u32 s1, s1, s12 +; GFX9-NEXT: s_xor_b64 s[14:15], s[0:1], s[12:13] +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s14 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s15 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX9-NEXT: s_sub_u32 s0, 0, s14 +; GFX9-NEXT: s_subb_u32 s1, 0, s15 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -9153,60 +9153,60 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s14, v1 -; GFX9-NEXT: v_readfirstlane_b32 s15, v0 -; GFX9-NEXT: s_mul_i32 s16, s0, s14 -; GFX9-NEXT: s_mul_hi_u32 s18, s0, s15 -; GFX9-NEXT: s_mul_i32 s17, s1, s15 +; GFX9-NEXT: v_readfirstlane_b32 s2, v1 +; GFX9-NEXT: v_readfirstlane_b32 s3, v0 +; GFX9-NEXT: s_mul_i32 s16, s0, s2 +; GFX9-NEXT: s_mul_hi_u32 s18, s0, s3 +; GFX9-NEXT: s_mul_i32 s17, s1, s3 ; GFX9-NEXT: s_add_i32 s16, s18, s16 -; GFX9-NEXT: s_mul_i32 s19, s0, s15 +; GFX9-NEXT: s_mul_i32 s19, s0, s3 ; GFX9-NEXT: s_add_i32 s16, s16, s17 -; GFX9-NEXT: s_mul_hi_u32 s17, s15, s16 -; GFX9-NEXT: s_mul_i32 s18, s15, s16 -; GFX9-NEXT: s_mul_hi_u32 s15, s15, s19 -; GFX9-NEXT: s_add_u32 s15, s15, s18 +; GFX9-NEXT: s_mul_hi_u32 s17, s3, s16 +; GFX9-NEXT: s_mul_i32 s18, s3, s16 +; GFX9-NEXT: s_mul_hi_u32 s3, s3, s19 +; GFX9-NEXT: s_add_u32 s3, s3, s18 ; GFX9-NEXT: s_addc_u32 s17, 0, s17 -; GFX9-NEXT: s_mul_hi_u32 s20, s14, s19 -; GFX9-NEXT: s_mul_i32 s19, s14, s19 -; GFX9-NEXT: s_add_u32 s15, s15, s19 -; GFX9-NEXT: s_mul_hi_u32 s18, s14, s16 -; GFX9-NEXT: s_addc_u32 s15, s17, s20 +; GFX9-NEXT: s_mul_hi_u32 s20, s2, s19 +; GFX9-NEXT: s_mul_i32 s19, s2, s19 +; GFX9-NEXT: s_add_u32 s3, s3, s19 +; GFX9-NEXT: s_mul_hi_u32 s18, s2, s16 +; GFX9-NEXT: s_addc_u32 s3, s17, s20 ; GFX9-NEXT: s_addc_u32 s17, s18, 0 -; GFX9-NEXT: s_mul_i32 s16, s14, s16 -; GFX9-NEXT: s_add_u32 s15, s15, s16 +; GFX9-NEXT: s_mul_i32 s16, s2, s16 +; GFX9-NEXT: s_add_u32 s3, s3, s16 ; GFX9-NEXT: s_addc_u32 s16, 0, s17 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s15, v0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s3, v0 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_addc_u32 s14, s14, s16 +; GFX9-NEXT: s_addc_u32 s2, s2, s16 ; GFX9-NEXT: v_readfirstlane_b32 s16, v0 -; GFX9-NEXT: s_mul_i32 s15, s0, s14 +; GFX9-NEXT: s_mul_i32 s3, s0, s2 ; GFX9-NEXT: s_mul_hi_u32 s17, s0, s16 -; GFX9-NEXT: s_add_i32 s15, s17, s15 +; GFX9-NEXT: s_add_i32 s3, s17, s3 ; GFX9-NEXT: s_mul_i32 s1, s1, s16 -; GFX9-NEXT: s_add_i32 s15, s15, s1 +; GFX9-NEXT: s_add_i32 s3, s3, s1 ; GFX9-NEXT: s_mul_i32 s0, s0, s16 -; GFX9-NEXT: s_mul_hi_u32 s17, s14, s0 -; GFX9-NEXT: s_mul_i32 s18, s14, s0 -; GFX9-NEXT: s_mul_i32 s20, s16, s15 +; GFX9-NEXT: s_mul_hi_u32 s17, s2, s0 +; GFX9-NEXT: s_mul_i32 s18, s2, s0 +; GFX9-NEXT: s_mul_i32 s20, s16, s3 ; GFX9-NEXT: s_mul_hi_u32 s0, s16, s0 -; GFX9-NEXT: s_mul_hi_u32 s19, s16, s15 +; GFX9-NEXT: s_mul_hi_u32 s19, s16, s3 ; GFX9-NEXT: s_add_u32 s0, s0, s20 ; GFX9-NEXT: s_addc_u32 s16, 0, s19 ; GFX9-NEXT: s_add_u32 s0, s0, s18 -; GFX9-NEXT: s_mul_hi_u32 s1, s14, s15 +; GFX9-NEXT: s_mul_hi_u32 s1, s2, s3 ; GFX9-NEXT: s_addc_u32 s0, s16, s17 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_mul_i32 s15, s14, s15 -; GFX9-NEXT: s_add_u32 s0, s0, s15 +; GFX9-NEXT: s_mul_i32 s3, s2, s3 +; GFX9-NEXT: s_add_u32 s0, s0, s3 ; GFX9-NEXT: s_addc_u32 s1, 0, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_addc_u32 s16, s14, s1 -; GFX9-NEXT: s_ashr_i32 s14, s5, 31 -; GFX9-NEXT: s_add_u32 s0, s4, s14 -; GFX9-NEXT: s_mov_b32 s15, s14 -; GFX9-NEXT: s_addc_u32 s1, s5, s14 -; GFX9-NEXT: s_xor_b64 s[4:5], s[0:1], s[14:15] +; GFX9-NEXT: s_addc_u32 s16, s2, s1 +; GFX9-NEXT: s_ashr_i32 s2, s5, 31 +; GFX9-NEXT: s_add_u32 s0, s4, s2 +; GFX9-NEXT: s_mov_b32 s3, s2 +; GFX9-NEXT: s_addc_u32 s1, s5, s2 +; GFX9-NEXT: s_xor_b64 s[4:5], s[0:1], s[2:3] ; GFX9-NEXT: v_readfirstlane_b32 s17, v0 ; GFX9-NEXT: s_mul_i32 s1, s4, s16 ; GFX9-NEXT: s_mul_hi_u32 s18, s4, s17 @@ -9222,24 +9222,24 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_mul_i32 s16, s5, s16 ; GFX9-NEXT: s_add_u32 s16, s0, s16 ; GFX9-NEXT: s_addc_u32 s17, 0, s1 -; GFX9-NEXT: s_mul_i32 s0, s12, s17 -; GFX9-NEXT: s_mul_hi_u32 s1, s12, s16 +; GFX9-NEXT: s_mul_i32 s0, s14, s17 +; GFX9-NEXT: s_mul_hi_u32 s1, s14, s16 ; GFX9-NEXT: s_add_i32 s0, s1, s0 -; GFX9-NEXT: s_mul_i32 s1, s13, s16 +; GFX9-NEXT: s_mul_i32 s1, s15, s16 ; GFX9-NEXT: s_add_i32 s18, s0, s1 -; GFX9-NEXT: s_mul_i32 s1, s12, s16 +; GFX9-NEXT: s_mul_i32 s1, s14, s16 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: s_sub_i32 s0, s5, s18 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v0 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_subb_u32 s4, s0, s13 -; GFX9-NEXT: v_subrev_co_u32_e64 v1, s[0:1], s12, v0 +; GFX9-NEXT: s_subb_u32 s4, s0, s15 +; GFX9-NEXT: v_subrev_co_u32_e64 v1, s[0:1], s14, v0 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_subb_u32 s4, s4, 0 -; GFX9-NEXT: s_cmp_ge_u32 s4, s13 +; GFX9-NEXT: s_cmp_ge_u32 s4, s15 ; GFX9-NEXT: s_cselect_b32 s19, -1, 0 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v1 -; GFX9-NEXT: s_cmp_eq_u32 s4, s13 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v1 +; GFX9-NEXT: s_cmp_eq_u32 s4, s15 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v2, s19 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -9257,35 +9257,35 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] ; GFX9-NEXT: s_subb_u32 s0, s5, s18 -; GFX9-NEXT: s_cmp_ge_u32 s0, s13 +; GFX9-NEXT: s_cmp_ge_u32 s0, s15 ; GFX9-NEXT: s_cselect_b32 s1, -1, 0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 -; GFX9-NEXT: s_cmp_eq_u32 s0, s13 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s14, v0 +; GFX9-NEXT: s_cmp_eq_u32 s0, s15 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_xor_b64 s[0:1], s[14:15], s[8:9] -; GFX9-NEXT: s_ashr_i32 s4, s11, 31 +; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], s[12:13] +; GFX9-NEXT: s_ashr_i32 s2, s11, 31 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX9-NEXT: s_add_u32 s8, s10, s4 +; GFX9-NEXT: s_add_u32 s4, s10, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s17 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_mov_b32 s5, s4 -; GFX9-NEXT: s_addc_u32 s9, s11, s4 +; GFX9-NEXT: s_mov_b32 s3, s2 +; GFX9-NEXT: s_addc_u32 s5, s11, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: s_xor_b64 s[8:9], s[8:9], s[4:5] +; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s9 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s5 ; GFX9-NEXT: v_xor_b32_e32 v1, s0, v1 ; GFX9-NEXT: v_xor_b32_e32 v5, s1, v0 ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v1 ; GFX9-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; GFX9-NEXT: v_rcp_f32_e32 v2, v2 -; GFX9-NEXT: s_sub_u32 s0, 0, s8 +; GFX9-NEXT: s_sub_u32 s0, 0, s4 ; GFX9-NEXT: v_mov_b32_e32 v6, s1 -; GFX9-NEXT: s_subb_u32 s1, 0, s9 +; GFX9-NEXT: s_subb_u32 s1, 0, s5 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 @@ -9362,24 +9362,24 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_mul_i32 s12, s7, s12 ; GFX9-NEXT: s_add_u32 s12, s0, s12 ; GFX9-NEXT: s_addc_u32 s13, 0, s1 -; GFX9-NEXT: s_mul_i32 s0, s8, s13 -; GFX9-NEXT: s_mul_hi_u32 s1, s8, s12 +; GFX9-NEXT: s_mul_i32 s0, s4, s13 +; GFX9-NEXT: s_mul_hi_u32 s1, s4, s12 ; GFX9-NEXT: s_add_i32 s0, s1, s0 -; GFX9-NEXT: s_mul_i32 s1, s9, s12 +; GFX9-NEXT: s_mul_i32 s1, s5, s12 ; GFX9-NEXT: s_add_i32 s14, s0, s1 -; GFX9-NEXT: s_mul_i32 s1, s8, s12 +; GFX9-NEXT: s_mul_i32 s1, s4, s12 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: s_sub_i32 s0, s7, s14 ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s6, v2 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_subb_u32 s6, s0, s9 -; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s8, v2 +; GFX9-NEXT: s_subb_u32 s6, s0, s5 +; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s4, v2 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_subb_u32 s6, s6, 0 -; GFX9-NEXT: s_cmp_ge_u32 s6, s9 +; GFX9-NEXT: s_cmp_ge_u32 s6, s5 ; GFX9-NEXT: s_cselect_b32 s15, -1, 0 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v3 -; GFX9-NEXT: s_cmp_eq_u32 s6, s9 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v3 +; GFX9-NEXT: s_cmp_eq_u32 s6, s5 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v5, s15 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -9397,10 +9397,10 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[0:1] ; GFX9-NEXT: s_subb_u32 s0, s7, s14 -; GFX9-NEXT: s_cmp_ge_u32 s0, s9 +; GFX9-NEXT: s_cmp_ge_u32 s0, s5 ; GFX9-NEXT: s_cselect_b32 s1, -1, 0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 -; GFX9-NEXT: s_cmp_eq_u32 s0, s9 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v2 +; GFX9-NEXT: s_cmp_eq_u32 s0, s5 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; GFX9-NEXT: v_mov_b32_e32 v6, s1 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 @@ -9410,13 +9410,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v5, vcc ; GFX9-NEXT: v_mov_b32_e32 v5, s12 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9-NEXT: s_xor_b64 s[0:1], s[10:11], s[4:5] +; GFX9-NEXT: s_xor_b64 s[0:1], s[10:11], s[2:3] ; GFX9-NEXT: v_xor_b32_e32 v3, s0, v3 ; GFX9-NEXT: v_xor_b32_e32 v5, s1, v2 ; GFX9-NEXT: v_mov_b32_e32 v6, s1 ; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v3 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v6, vcc -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i64> , %y %r = sdiv <2 x i64> %x, %shl.y @@ -9525,100 +9526,100 @@ define amdgpu_kernel void @srem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX9-LABEL: srem_i64_oddk_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s4, 0x33fe64 -; GFX9-NEXT: s_add_u32 s4, 0x396, s4 +; GFX9-NEXT: s_mov_b32 s0, 0x33fe64 +; GFX9-NEXT: s_add_u32 s0, 0x396, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x28100000 -; GFX9-NEXT: s_addc_u32 s5, 0, 0 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0 +; GFX9-NEXT: s_addc_u32 s1, 0, 0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_addc_u32 s4, s5, 0xd95 -; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: s_mul_i32 s5, s4, 0xffed2705 -; GFX9-NEXT: s_mul_hi_u32 s7, s6, 0xffed2705 -; GFX9-NEXT: s_add_i32 s7, s7, s5 -; GFX9-NEXT: s_sub_i32 s5, s7, s6 -; GFX9-NEXT: s_mul_i32 s8, s6, 0xffed2705 -; GFX9-NEXT: s_mul_hi_u32 s11, s6, s5 -; GFX9-NEXT: s_mul_i32 s12, s6, s5 -; GFX9-NEXT: s_mul_hi_u32 s6, s6, s8 -; GFX9-NEXT: s_add_u32 s6, s6, s12 -; GFX9-NEXT: s_mul_hi_u32 s9, s4, s8 -; GFX9-NEXT: s_mul_i32 s10, s4, s8 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_addc_u32 s0, s1, 0xd95 +; GFX9-NEXT: v_readfirstlane_b32 s2, v0 +; GFX9-NEXT: s_mul_i32 s1, s0, 0xffed2705 +; GFX9-NEXT: s_mul_hi_u32 s3, s2, 0xffed2705 +; GFX9-NEXT: s_add_i32 s3, s3, s1 +; GFX9-NEXT: s_sub_i32 s1, s3, s2 +; GFX9-NEXT: s_mul_i32 s8, s2, 0xffed2705 +; GFX9-NEXT: s_mul_hi_u32 s11, s2, s1 +; GFX9-NEXT: s_mul_i32 s12, s2, s1 +; GFX9-NEXT: s_mul_hi_u32 s2, s2, s8 +; GFX9-NEXT: s_add_u32 s2, s2, s12 +; GFX9-NEXT: s_mul_hi_u32 s9, s0, s8 +; GFX9-NEXT: s_mul_i32 s10, s0, s8 ; GFX9-NEXT: s_addc_u32 s8, 0, s11 -; GFX9-NEXT: s_add_u32 s6, s6, s10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: s_mul_hi_u32 s7, s4, s5 -; GFX9-NEXT: s_addc_u32 s6, s8, s9 -; GFX9-NEXT: s_addc_u32 s7, s7, 0 -; GFX9-NEXT: s_mul_i32 s5, s4, s5 -; GFX9-NEXT: s_add_u32 s5, s6, s5 -; GFX9-NEXT: s_addc_u32 s6, 0, s7 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s5, v0 +; GFX9-NEXT: s_add_u32 s2, s2, s10 +; GFX9-NEXT: s_mul_hi_u32 s3, s0, s1 +; GFX9-NEXT: s_addc_u32 s2, s8, s9 +; GFX9-NEXT: s_addc_u32 s3, s3, 0 +; GFX9-NEXT: s_mul_i32 s1, s0, s1 +; GFX9-NEXT: s_add_u32 s1, s2, s1 +; GFX9-NEXT: s_addc_u32 s2, 0, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s1, v0 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_addc_u32 s6, s4, s6 +; GFX9-NEXT: s_addc_u32 s8, s0, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s4, s3, 31 -; GFX9-NEXT: s_add_u32 s2, s2, s4 -; GFX9-NEXT: s_mov_b32 s5, s4 -; GFX9-NEXT: s_addc_u32 s3, s3, s4 -; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] -; GFX9-NEXT: v_readfirstlane_b32 s8, v0 -; GFX9-NEXT: s_mul_i32 s7, s2, s6 -; GFX9-NEXT: s_mul_hi_u32 s9, s2, s8 -; GFX9-NEXT: s_mul_hi_u32 s5, s2, s6 -; GFX9-NEXT: s_add_u32 s7, s9, s7 -; GFX9-NEXT: s_addc_u32 s5, 0, s5 -; GFX9-NEXT: s_mul_hi_u32 s10, s3, s8 -; GFX9-NEXT: s_mul_i32 s8, s3, s8 -; GFX9-NEXT: s_add_u32 s7, s7, s8 -; GFX9-NEXT: s_mul_hi_u32 s9, s3, s6 -; GFX9-NEXT: s_addc_u32 s5, s5, s10 -; GFX9-NEXT: s_addc_u32 s7, s9, 0 -; GFX9-NEXT: s_mul_i32 s6, s3, s6 -; GFX9-NEXT: s_add_u32 s5, s5, s6 -; GFX9-NEXT: s_addc_u32 s6, 0, s7 -; GFX9-NEXT: s_mul_hi_u32 s8, s5, 0x12d8fb -; GFX9-NEXT: s_mul_i32 s5, s5, 0x12d8fb +; GFX9-NEXT: s_ashr_i32 s0, s7, 31 +; GFX9-NEXT: s_add_u32 s2, s6, s0 +; GFX9-NEXT: s_mov_b32 s1, s0 +; GFX9-NEXT: s_addc_u32 s3, s7, s0 +; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1] +; GFX9-NEXT: v_readfirstlane_b32 s7, v0 +; GFX9-NEXT: s_mul_i32 s6, s2, s8 +; GFX9-NEXT: s_mul_hi_u32 s9, s2, s7 +; GFX9-NEXT: s_mul_hi_u32 s1, s2, s8 +; GFX9-NEXT: s_add_u32 s6, s9, s6 +; GFX9-NEXT: s_addc_u32 s1, 0, s1 +; GFX9-NEXT: s_mul_hi_u32 s10, s3, s7 +; GFX9-NEXT: s_mul_i32 s7, s3, s7 +; GFX9-NEXT: s_add_u32 s6, s6, s7 +; GFX9-NEXT: s_mul_hi_u32 s9, s3, s8 +; GFX9-NEXT: s_addc_u32 s1, s1, s10 +; GFX9-NEXT: s_addc_u32 s6, s9, 0 +; GFX9-NEXT: s_mul_i32 s7, s3, s8 +; GFX9-NEXT: s_add_u32 s1, s1, s7 +; GFX9-NEXT: s_addc_u32 s6, 0, s6 +; GFX9-NEXT: s_mul_hi_u32 s8, s1, 0x12d8fb +; GFX9-NEXT: s_mul_i32 s1, s1, 0x12d8fb ; GFX9-NEXT: s_mul_i32 s6, s6, 0x12d8fb -; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: s_add_i32 s8, s8, s6 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 ; GFX9-NEXT: s_mov_b32 s7, 0x12d8fb ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_subb_u32 s2, s3, s8 +; GFX9-NEXT: s_subb_u32 s1, s3, s8 ; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s7, v0 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_subb_u32 s3, s2, 0 +; GFX9-NEXT: s_subb_u32 s2, s1, 0 ; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s7, v1 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_subb_u32 s5, s3, 0 +; GFX9-NEXT: s_subb_u32 s3, s2, 0 ; GFX9-NEXT: s_mov_b32 s6, 0x12d8fa ; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s6, v1 -; GFX9-NEXT: s_cmp_eq_u32 s3, 0 +; GFX9-NEXT: s_cmp_eq_u32 s2, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GFX9-NEXT: v_mov_b32_e32 v5, s3 -; GFX9-NEXT: v_mov_b32_e32 v6, s5 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 +; GFX9-NEXT: v_mov_b32_e32 v6, s3 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s6, v0 -; GFX9-NEXT: s_cmp_eq_u32 s2, 0 +; GFX9-NEXT: s_cmp_eq_u32 s1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX9-NEXT: v_mov_b32_e32 v5, s2 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc -; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX9-NEXT: v_xor_b32_e32 v1, s4, v3 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s4, v0 +; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX9-NEXT: v_xor_b32_e32 v1, s0, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %r = srem i64 %x, 1235195 store i64 %r, ptr addrspace(1) %out @@ -9653,19 +9654,19 @@ define amdgpu_kernel void @srem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX9-LABEL: srem_i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s4, s3, 31 -; GFX9-NEXT: s_lshr_b32 s4, s4, 20 -; GFX9-NEXT: s_add_u32 s4, s2, s4 -; GFX9-NEXT: s_addc_u32 s5, s3, 0 -; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff000 -; GFX9-NEXT: s_sub_u32 s2, s2, s4 -; GFX9-NEXT: s_subb_u32 s3, s3, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_ashr_i32 s0, s7, 31 +; GFX9-NEXT: s_lshr_b32 s0, s0, 20 +; GFX9-NEXT: s_add_u32 s0, s6, s0 +; GFX9-NEXT: s_addc_u32 s1, s7, 0 +; GFX9-NEXT: s_and_b32 s0, s0, 0xfffff000 +; GFX9-NEXT: s_sub_u32 s0, s6, s0 +; GFX9-NEXT: s_subb_u32 s1, s7, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %r = srem i64 %x, 4096 store i64 %r, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/and.ll b/llvm/test/CodeGen/AMDGPU/and.ll index d6137597293f6..c6233642110ea 100644 --- a/llvm/test/CodeGen/AMDGPU/and.ll +++ b/llvm/test/CodeGen/AMDGPU/and.ll @@ -227,10 +227,10 @@ define amdgpu_kernel void @s_and_32_bit_constant_i64(ptr addrspace(1) %out, i32, ; SI: s_load_dword [[B:s[0-9]+]] ; SI: s_load_dwordx2 ; SI-NOT: and -; SI: s_lshl_b32 [[A]], [[A]], 1 -; SI: s_lshl_b32 [[B]], [[B]], 1 -; SI: s_and_b32 s{{[0-9]+}}, [[A]], 62 -; SI: s_and_b32 s{{[0-9]+}}, [[B]], 62 +; SI: s_lshl_b32 [[C:s[0-9]+]], [[A]], 1 +; SI: s_lshl_b32 [[D:s[0-9]+]], [[B]], 1 +; SI: s_and_b32 s{{[0-9]+}}, [[C]], 62 +; SI: s_and_b32 s{{[0-9]+}}, [[D]], 62 ; SI-NOT: and ; SI: buffer_store_dwordx2 define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(ptr addrspace(1) %out, i32, i64 %a, i32, i64 %b, i32, i64 %c) { @@ -371,9 +371,9 @@ define amdgpu_kernel void @s_and_inline_imm_64_i64(ptr addrspace(1) %out, ptr ad ; FUNC-LABEL: {{^}}s_and_inline_imm_64_i64_noshrink: ; SI: s_load_dword [[A:s[0-9]+]] -; SI: s_lshl_b32 [[A]], [[A]], 1{{$}} +; SI: s_lshl_b32 [[B:s[0-9]+]], [[A]], 1{{$}} ; SI-NOT: and -; SI: s_and_b32 s{{[0-9]+}}, [[A]], 64 +; SI: s_and_b32 s{{[0-9]+}}, [[B]], 64 ; SI-NOT: and ; SI: s_add_u32 ; SI-NEXT: s_addc_u32 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll index b1134ae78cb97..6ec6f6aebf2b1 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -427,13 +427,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W32-NEXT: .LBB1_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX10W32-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s4, v1 +; GFX10W32-NEXT: s_mov_b32 null, 0 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v0, s[4:5] +; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v0, s[2:3] ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10W32-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: add_i32_uniform: @@ -1921,14 +1922,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W32-NEXT: .LBB6_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX10W32-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10W32-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: sub_i32_uniform: diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index bc5d2662dcb45..da88b6792f269 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -89,101 +89,101 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: add_i32_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b64 s[6:7], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b64 s[8:9], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB0_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s2 -; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7] +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[8:9] ; GFX9-NEXT: s_mul_i32 s2, s2, 5 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: s_mov_b32 s15, 0xf000 +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc +; GFX9-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: .LBB0_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_readfirstlane_b32 s0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: add_i32_constant: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b64 s[6:7], exec +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b64 s[8:9], exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB0_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[8:9] ; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1064-NEXT: s_mul_i32 s6, s6, 5 +; GFX1064-NEXT: s_mul_i32 s2, s2, 5 ; GFX1064-NEXT: s_mov_b32 s10, -1 -; GFX1064-NEXT: v_mov_b32_e32 v1, s6 +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mov_b32 s8, s2 -; GFX1064-NEXT: s_mov_b32 s9, s3 +; GFX1064-NEXT: s_mov_b32 s8, s6 +; GFX1064-NEXT: s_mov_b32 s9, s7 ; GFX1064-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB0_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064-NEXT: s_mov_b32 s6, -1 +; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s0 +; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: add_i32_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 s5, exec_lo +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 s1, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB0_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 +; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1032-NEXT: s_mul_i32 s5, s5, 5 +; GFX1032-NEXT: s_mul_i32 s1, s1, 5 ; GFX1032-NEXT: s_mov_b32 s10, -1 -; GFX1032-NEXT: v_mov_b32_e32 v1, s5 +; GFX1032-NEXT: v_mov_b32_e32 v1, s1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mov_b32 s8, s2 -; GFX1032-NEXT: s_mov_b32 s9, s3 +; GFX1032-NEXT: s_mov_b32 s8, s6 +; GFX1032-NEXT: s_mov_b32 s9, s7 ; GFX1032-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB0_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032-NEXT: s_mov_b32 s6, -1 +; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s0 +; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: add_i32_constant: @@ -766,137 +766,137 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_ITERATIVE-LABEL: add_i32_varying: ; GFX9_ITERATIVE: ; %bb.0: ; %entry ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX9_ITERATIVE-NEXT: s_mov_b32 s6, 0 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s8, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX9_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s4 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s4 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s4 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 -; GFX9_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s8, m0 +; GFX9_ITERATIVE-NEXT: s_add_i32 s8, s8, s6 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: -; GFX9_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 -; GFX9_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s15, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s14, -1 ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_ITERATIVE-NEXT: s_mov_b32 s8, s2 -; GFX9_ITERATIVE-NEXT: s_mov_b32 s9, s3 -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6 -; GFX9_ITERATIVE-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX9_ITERATIVE-NEXT: s_mov_b32 s12, s6 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s13, s7 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s8 +; GFX9_ITERATIVE-NEXT: buffer_atomic_add v0, off, s[12:15], 0 glc ; GFX9_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX9_ITERATIVE-NEXT: buffer_wbinvl1_vol ; GFX9_ITERATIVE-NEXT: .LBB2_4: -; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 -; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 -; GFX9_ITERATIVE-NEXT: v_add_u32_e32 v0, s4, v1 -; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s7, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s6, -1 +; GFX9_ITERATIVE-NEXT: v_add_u32_e32 v0, s0, v1 +; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9_ITERATIVE-NEXT: s_endpgm ; ; GFX1064_ITERATIVE-LABEL: add_i32_varying: ; GFX1064_ITERATIVE: ; %bb.0: ; %entry ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX1064_ITERATIVE-NEXT: s_mov_b32 s6, 0 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s8, 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1064_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s7, s[0:1] -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s7 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s7 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s7 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s6 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s8, s6 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX1064_ITERATIVE-NEXT: s_add_i32 s6, s6, s8 +; GFX1064_ITERATIVE-NEXT: s_add_i32 s8, s8, s7 ; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, s8 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_ITERATIVE-NEXT: s_mov_b32 s8, s2 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s8, s6 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s9, s7 ; GFX1064_ITERATIVE-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX1064_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1064_ITERATIVE-NEXT: .LBB2_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v1 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 -; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s6, -1 +; GFX1064_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s0, v1 +; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX1064_ITERATIVE-NEXT: s_endpgm ; ; GFX1032_ITERATIVE-LABEL: add_i32_varying: ; GFX1032_ITERATIVE: ; %bb.0: ; %entry -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1032_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s1 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s1 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s1 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s6 -; GFX1032_ITERATIVE-NEXT: s_add_i32 s4, s4, s5 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s1 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s5, exec_lo, s5 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s8, s2 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s8, s6 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s9, s7 ; GFX1032_ITERATIVE-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX1032_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1032_ITERATIVE-NEXT: .LBB2_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v1 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 -; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s6, -1 +; GFX1032_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s0, v1 +; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX1032_ITERATIVE-NEXT: s_endpgm ; ; GFX1164_ITERATIVE-LABEL: add_i32_varying: @@ -1180,17 +1180,17 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9_DPP-LABEL: add_i32_varying: ; GFX9_DPP: ; %bb.0: ; %entry -; GFX9_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -1202,33 +1202,33 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_readlane_b32 s6, v2, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s2, v2, 63 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB2_2 ; GFX9_DPP-NEXT: ; %bb.1: ; GFX9_DPP-NEXT: s_mov_b32 s11, 0xf000 ; GFX9_DPP-NEXT: s_mov_b32 s10, -1 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_DPP-NEXT: s_mov_b32 s8, s2 -; GFX9_DPP-NEXT: s_mov_b32 s9, s3 -; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s6 +; GFX9_DPP-NEXT: s_mov_b32 s8, s6 +; GFX9_DPP-NEXT: s_mov_b32 s9, s7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s2 ; GFX9_DPP-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX9_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9_DPP-NEXT: buffer_wbinvl1_vol ; GFX9_DPP-NEXT: .LBB2_2: -; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 -; GFX9_DPP-NEXT: s_mov_b32 s2, -1 -; GFX9_DPP-NEXT: v_add_u32_e32 v0, s4, v0 -; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_DPP-NEXT: s_mov_b32 s7, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s6, -1 +; GFX9_DPP-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9_DPP-NEXT: s_endpgm ; ; GFX1064_DPP-LABEL: add_i32_varying: @@ -1249,50 +1249,50 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 15 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 16 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1064_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s8, 16 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47 ; GFX1064_DPP-NEXT: v_readlane_b32 s9, v1, 63 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s7, 32 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1064_DPP-NEXT: s_mov_b32 s4, s9 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064_DPP-NEXT: s_mov_b32 s0, s9 ; GFX1064_DPP-NEXT: v_writelane_b32 v3, s8, 48 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB2_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s4 -; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s0 +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: s_mov_b32 s4, s2 -; GFX1064_DPP-NEXT: s_mov_b32 s5, s3 -; GFX1064_DPP-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc +; GFX1064_DPP-NEXT: s_mov_b32 s0, s6 +; GFX1064_DPP-NEXT: s_mov_b32 s1, s7 +; GFX1064_DPP-NEXT: buffer_atomic_add v0, off, s[0:3], 0 glc ; GFX1064_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl1_inv ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB2_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064_DPP-NEXT: v_add_nc_u32_e32 v0, s2, v0 -; GFX1064_DPP-NEXT: s_mov_b32 s2, s6 -; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: s_mov_b32 s6, s2 +; GFX1064_DPP-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: add_i32_varying: @@ -1309,44 +1309,44 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s2, v1, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v3, s5, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_DPP-NEXT: s_mov_b32 s4, s6 -; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_mov_b32 s0, s2 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB2_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s4 -; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s0 +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: s_mov_b32 s4, s2 -; GFX1032_DPP-NEXT: s_mov_b32 s5, s3 -; GFX1032_DPP-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc +; GFX1032_DPP-NEXT: s_mov_b32 s0, s6 +; GFX1032_DPP-NEXT: s_mov_b32 s1, s7 +; GFX1032_DPP-NEXT: buffer_atomic_add v0, off, s[0:3], 0 glc ; GFX1032_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl1_inv ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB2_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032_DPP-NEXT: v_add_nc_u32_e32 v0, s2, v0 -; GFX1032_DPP-NEXT: s_mov_b32 s2, s6 -; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: s_mov_b32 s6, s2 +; GFX1032_DPP-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: add_i32_varying: @@ -1712,110 +1712,109 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: add_i64_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b64 s[6:7], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b64 s[8:9], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB3_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s2 -; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7] +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[8:9] ; GFX9-NEXT: s_mul_i32 s2, s2, 5 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: s_mov_b32 s15, 0xf000 +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc +; GFX9-NEXT: buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: .LBB3_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_readfirstlane_b32 s0, v1 +; GFX9-NEXT: v_readfirstlane_b32 s1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 5, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s2, v1 -; GFX9-NEXT: v_readfirstlane_b32 s3, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_nop 2 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: add_i64_constant: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b64 s[6:7], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b64 s[8:9], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB3_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[8:9] ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_mul_i32 s6, s6, 5 +; GFX1064-NEXT: s_mul_i32 s2, s2, 5 ; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1064-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064-NEXT: v_mov_b32_e32 v0, s2 ; GFX1064-NEXT: s_mov_b32 s10, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mov_b32 s8, s2 -; GFX1064-NEXT: s_mov_b32 s9, s3 +; GFX1064-NEXT: s_mov_b32 s8, s6 +; GFX1064-NEXT: s_mov_b32 s9, s7 ; GFX1064-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB3_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: v_readfirstlane_b32 s1, v1 +; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3] -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064-NEXT: s_mov_b32 s6, -1 +; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 5, s[0:1] +; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: add_i64_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 s5, exec_lo +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 s1, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB3_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 +; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_mul_i32 s5, s5, 5 +; GFX1032-NEXT: s_mul_i32 s1, s1, 5 ; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1032-NEXT: v_mov_b32_e32 v0, s5 +; GFX1032-NEXT: v_mov_b32_e32 v0, s1 ; GFX1032-NEXT: s_mov_b32 s10, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mov_b32 s8, s2 -; GFX1032-NEXT: s_mov_b32 s9, s3 +; GFX1032-NEXT: s_mov_b32 s8, s6 +; GFX1032-NEXT: s_mov_b32 s9, s7 ; GFX1032-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB3_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: v_readfirstlane_b32 s1, v1 +; GFX1032-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3] -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032-NEXT: s_mov_b32 s6, -1 +; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, v2, 5, s[0:1] +; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: add_i64_constant: @@ -2480,160 +2479,160 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9_ITERATIVE-LABEL: add_i64_varying: ; GFX9_ITERATIVE: ; %bb.0: ; %entry -; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX9_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1] +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[4:5] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s6 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 -; GFX9_ITERATIVE-NEXT: s_add_u32 s4, s4, s8 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s5, m0 -; GFX9_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX9_ITERATIVE-NEXT: s_add_u32 s0, s0, s8 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_ITERATIVE-NEXT: s_mov_b32 s8, s2 -; GFX9_ITERATIVE-NEXT: s_mov_b32 s9, s3 -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s8, s6 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s9, s7 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX9_ITERATIVE-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc ; GFX9_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX9_ITERATIVE-NEXT: buffer_wbinvl1_vol ; GFX9_ITERATIVE-NEXT: .LBB5_4: -; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 -; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 -; GFX9_ITERATIVE-NEXT: v_add_co_u32_e32 v0, vcc, s5, v1 +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s0, v4 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s1, v3 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX9_ITERATIVE-NEXT: v_add_co_u32_e32 v0, vcc, s1, v1 ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 -; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s7, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s6, -1 ; GFX9_ITERATIVE-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v2, vcc -; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9_ITERATIVE-NEXT: s_endpgm ; ; GFX1064_ITERATIVE-LABEL: add_i64_varying: ; GFX1064_ITERATIVE: ; %bb.0: ; %entry ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[4:5] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s6 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s5, s6 -; GFX1064_ITERATIVE-NEXT: s_add_u32 s4, s4, s7 -; GFX1064_ITERATIVE-NEXT: s_addc_u32 s5, s5, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6 +; GFX1064_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 +; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_ITERATIVE-NEXT: s_mov_b32 s8, s2 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s8, s6 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s9, s7 ; GFX1064_ITERATIVE-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc ; GFX1064_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1064_ITERATIVE-NEXT: .LBB5_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s0, v3 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s1, v4 ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 -; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 -; GFX1064_ITERATIVE-NEXT: v_add_co_u32 v0, vcc, s2, v1 -; GFX1064_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v2, vcc -; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 -; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s6, -1 +; GFX1064_ITERATIVE-NEXT: v_add_co_u32 v0, vcc, s0, v1 +; GFX1064_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc, s1, v2, vcc +; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX1064_ITERATIVE-NEXT: s_endpgm ; ; GFX1032_ITERATIVE-LABEL: add_i64_varying: ; GFX1032_ITERATIVE: ; %bb.0: ; %entry ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo -; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1032_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s1 -; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s1 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s1 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s5, s1 -; GFX1032_ITERATIVE-NEXT: s_add_u32 s4, s4, s6 -; GFX1032_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s5, s4 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s5 +; GFX1032_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 +; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s6, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s6, exec_lo, s6 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s8, s2 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s8, s6 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s9, s7 ; GFX1032_ITERATIVE-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc ; GFX1032_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1032_ITERATIVE-NEXT: .LBB5_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s0, v3 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s1, v4 ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 -; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 -; GFX1032_ITERATIVE-NEXT: v_add_co_u32 v0, vcc_lo, s2, v1 -; GFX1032_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 -; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s6, -1 +; GFX1032_ITERATIVE-NEXT: v_add_co_u32 v0, vcc_lo, s0, v1 +; GFX1032_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v2, vcc_lo +; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX1032_ITERATIVE-NEXT: s_endpgm ; ; GFX1164_ITERATIVE-LABEL: add_i64_varying: @@ -2987,22 +2986,22 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9_DPP-LABEL: add_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry -; GFX9_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8 ; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX9_DPP-NEXT: s_nop 0 @@ -3050,39 +3049,39 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc -; GFX9_DPP-NEXT: v_readlane_b32 s7, v4, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s6, v3, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s2, v3, 63 ; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB5_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s3 ; GFX9_DPP-NEXT: s_mov_b32 s11, 0xf000 ; GFX9_DPP-NEXT: s_mov_b32 s10, -1 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_DPP-NEXT: s_mov_b32 s8, s2 -; GFX9_DPP-NEXT: s_mov_b32 s9, s3 -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s6 +; GFX9_DPP-NEXT: s_mov_b32 s8, s6 +; GFX9_DPP-NEXT: s_mov_b32 s9, s7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s2 ; GFX9_DPP-NEXT: buffer_atomic_add_x2 v[7:8], off, s[8:11], 0 glc ; GFX9_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9_DPP-NEXT: buffer_wbinvl1_vol ; GFX9_DPP-NEXT: .LBB5_2: -; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v8 -; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v7 +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: v_readfirstlane_b32 s0, v8 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s1, v7 ; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v7, vcc, s5, v7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s0 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v7, vcc, s1, v7 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 -; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: s_mov_b32 s7, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s6, -1 ; GFX9_DPP-NEXT: v_addc_co_u32_e32 v8, vcc, v0, v8, vcc -; GFX9_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 +; GFX9_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 ; GFX9_DPP-NEXT: s_endpgm ; ; GFX1064_DPP-LABEL: add_i64_varying: @@ -3145,59 +3144,59 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 ; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v4, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v4, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v3, 15 ; GFX1064_DPP-NEXT: v_readlane_b32 s8, v4, 31 ; GFX1064_DPP-NEXT: v_readlane_b32 s9, v3, 31 ; GFX1064_DPP-NEXT: v_readlane_b32 s10, v3, 47 -; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 16 -; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v3, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s2, 16 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s3, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v3, 63 ; GFX1064_DPP-NEXT: v_readlane_b32 s11, v4, 47 -; GFX1064_DPP-NEXT: v_readlane_b32 s7, v4, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v4, 63 ; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 32 ; GFX1064_DPP-NEXT: v_writelane_b32 v1, s9, 32 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GFX1064_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX1064_DPP-NEXT: v_writelane_b32 v2, s11, 48 ; GFX1064_DPP-NEXT: v_writelane_b32 v1, s10, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[8:9] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB5_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s5 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s4 -; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: s_mov_b32 s4, s2 -; GFX1064_DPP-NEXT: s_mov_b32 s5, s3 -; GFX1064_DPP-NEXT: buffer_atomic_add_x2 v[9:10], off, s[4:7], 0 glc +; GFX1064_DPP-NEXT: s_mov_b32 s0, s6 +; GFX1064_DPP-NEXT: s_mov_b32 s1, s7 +; GFX1064_DPP-NEXT: buffer_atomic_add_x2 v[9:10], off, s[0:3], 0 glc ; GFX1064_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl1_inv ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB5_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s2, v9 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v9 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, v2 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v10 -; GFX1064_DPP-NEXT: v_add_co_u32 v9, vcc, s2, v11 -; GFX1064_DPP-NEXT: s_mov_b32 s2, s6 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc, s3, v12, vcc -; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: v_add_co_u32 v9, vcc, s0, v11 +; GFX1064_DPP-NEXT: s_mov_b32 s6, s2 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc, s1, v12, vcc +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: add_i64_varying: @@ -3248,51 +3247,51 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s6, -1 +; GFX1032_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 ; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo -; GFX1032_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s8, v4, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s7, v3, 15 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s6 +; GFX1032_DPP-NEXT: v_readlane_b32 s3, v3, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s6, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v2, s8, 16 -; GFX1032_DPP-NEXT: v_writelane_b32 v1, s7, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s6 +; GFX1032_DPP-NEXT: v_writelane_b32 v1, s3, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB5_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s5 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s4 -; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: s_mov_b32 s4, s2 -; GFX1032_DPP-NEXT: s_mov_b32 s5, s3 -; GFX1032_DPP-NEXT: buffer_atomic_add_x2 v[9:10], off, s[4:7], 0 glc +; GFX1032_DPP-NEXT: s_mov_b32 s0, s6 +; GFX1032_DPP-NEXT: s_mov_b32 s1, s7 +; GFX1032_DPP-NEXT: buffer_atomic_add_x2 v[9:10], off, s[0:3], 0 glc ; GFX1032_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl1_inv ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB5_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s2, v9 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v9 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, v2 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v10 -; GFX1032_DPP-NEXT: v_add_co_u32 v9, vcc_lo, s2, v11 -; GFX1032_DPP-NEXT: s_mov_b32 s2, s6 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, s3, v12, vcc_lo -; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: v_add_co_u32 v9, vcc_lo, s0, v11 +; GFX1032_DPP-NEXT: s_mov_b32 s6, s2 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, s1, v12, vcc_lo +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: add_i64_varying: @@ -3808,104 +3807,104 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: sub_i32_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b64 s[6:7], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b64 s[8:9], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB6_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s2 -; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7] +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[8:9] ; GFX9-NEXT: s_mul_i32 s2, s2, 5 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: s_mov_b32 s15, 0xf000 +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc +; GFX9-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: .LBB6_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_readfirstlane_b32 s0, v1 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: sub_i32_constant: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b64 s[6:7], exec +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b64 s[8:9], exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB6_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[8:9] ; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1064-NEXT: s_mul_i32 s6, s6, 5 +; GFX1064-NEXT: s_mul_i32 s2, s2, 5 ; GFX1064-NEXT: s_mov_b32 s10, -1 -; GFX1064-NEXT: v_mov_b32_e32 v1, s6 +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mov_b32 s8, s2 -; GFX1064-NEXT: s_mov_b32 s9, s3 +; GFX1064-NEXT: s_mov_b32 s8, s6 +; GFX1064-NEXT: s_mov_b32 s9, s7 ; GFX1064-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB6_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0 -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064-NEXT: s_mov_b32 s6, -1 +; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: sub_i32_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 s5, exec_lo +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 s1, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB6_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 +; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1032-NEXT: s_mul_i32 s5, s5, 5 +; GFX1032-NEXT: s_mul_i32 s1, s1, 5 ; GFX1032-NEXT: s_mov_b32 s10, -1 -; GFX1032-NEXT: v_mov_b32_e32 v1, s5 +; GFX1032-NEXT: v_mov_b32_e32 v1, s1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mov_b32 s8, s2 -; GFX1032-NEXT: s_mov_b32 s9, s3 +; GFX1032-NEXT: s_mov_b32 s8, s6 +; GFX1032-NEXT: s_mov_b32 s9, s7 ; GFX1032-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB6_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0 -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032-NEXT: s_mov_b32 s6, -1 +; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: sub_i32_constant: @@ -4498,137 +4497,137 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_ITERATIVE-LABEL: sub_i32_varying: ; GFX9_ITERATIVE: ; %bb.0: ; %entry ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX9_ITERATIVE-NEXT: s_mov_b32 s6, 0 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s8, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX9_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s4 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s4 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s4 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 -; GFX9_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s8, m0 +; GFX9_ITERATIVE-NEXT: s_add_i32 s8, s8, s6 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: -; GFX9_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 -; GFX9_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s15, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s14, -1 ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_ITERATIVE-NEXT: s_mov_b32 s8, s2 -; GFX9_ITERATIVE-NEXT: s_mov_b32 s9, s3 -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6 -; GFX9_ITERATIVE-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc +; GFX9_ITERATIVE-NEXT: s_mov_b32 s12, s6 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s13, s7 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s8 +; GFX9_ITERATIVE-NEXT: buffer_atomic_sub v0, off, s[12:15], 0 glc ; GFX9_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX9_ITERATIVE-NEXT: buffer_wbinvl1_vol ; GFX9_ITERATIVE-NEXT: .LBB8_4: -; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 -; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 -; GFX9_ITERATIVE-NEXT: v_sub_u32_e32 v0, s4, v1 -; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s7, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s6, -1 +; GFX9_ITERATIVE-NEXT: v_sub_u32_e32 v0, s0, v1 +; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9_ITERATIVE-NEXT: s_endpgm ; ; GFX1064_ITERATIVE-LABEL: sub_i32_varying: ; GFX1064_ITERATIVE: ; %bb.0: ; %entry ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX1064_ITERATIVE-NEXT: s_mov_b32 s6, 0 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s8, 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1064_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s7, s[0:1] -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s7 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s7 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s7 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s6 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s8, s6 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX1064_ITERATIVE-NEXT: s_add_i32 s6, s6, s8 +; GFX1064_ITERATIVE-NEXT: s_add_i32 s8, s8, s7 ; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, s8 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_ITERATIVE-NEXT: s_mov_b32 s8, s2 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s8, s6 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s9, s7 ; GFX1064_ITERATIVE-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX1064_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1064_ITERATIVE-NEXT: .LBB8_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v1 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 -; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s6, -1 +; GFX1064_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s0, v1 +; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX1064_ITERATIVE-NEXT: s_endpgm ; ; GFX1032_ITERATIVE-LABEL: sub_i32_varying: ; GFX1032_ITERATIVE: ; %bb.0: ; %entry -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1032_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s1 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s1 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s1 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s6 -; GFX1032_ITERATIVE-NEXT: s_add_i32 s4, s4, s5 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s1 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s5, exec_lo, s5 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s8, s2 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s8, s6 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s9, s7 ; GFX1032_ITERATIVE-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX1032_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1032_ITERATIVE-NEXT: .LBB8_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v1 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 -; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s6, -1 +; GFX1032_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s0, v1 +; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX1032_ITERATIVE-NEXT: s_endpgm ; ; GFX1164_ITERATIVE-LABEL: sub_i32_varying: @@ -4912,17 +4911,17 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9_DPP-LABEL: sub_i32_varying: ; GFX9_DPP: ; %bb.0: ; %entry -; GFX9_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -4934,33 +4933,33 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_readlane_b32 s6, v2, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s2, v2, 63 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB8_2 ; GFX9_DPP-NEXT: ; %bb.1: ; GFX9_DPP-NEXT: s_mov_b32 s11, 0xf000 ; GFX9_DPP-NEXT: s_mov_b32 s10, -1 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_DPP-NEXT: s_mov_b32 s8, s2 -; GFX9_DPP-NEXT: s_mov_b32 s9, s3 -; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s6 +; GFX9_DPP-NEXT: s_mov_b32 s8, s6 +; GFX9_DPP-NEXT: s_mov_b32 s9, s7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s2 ; GFX9_DPP-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX9_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9_DPP-NEXT: buffer_wbinvl1_vol ; GFX9_DPP-NEXT: .LBB8_2: -; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 -; GFX9_DPP-NEXT: s_mov_b32 s2, -1 -; GFX9_DPP-NEXT: v_sub_u32_e32 v0, s4, v0 -; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_DPP-NEXT: s_mov_b32 s7, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s6, -1 +; GFX9_DPP-NEXT: v_sub_u32_e32 v0, s0, v0 +; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9_DPP-NEXT: s_endpgm ; ; GFX1064_DPP-LABEL: sub_i32_varying: @@ -4981,50 +4980,50 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 15 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 16 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1064_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s8, 16 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47 ; GFX1064_DPP-NEXT: v_readlane_b32 s9, v1, 63 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s7, 32 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1064_DPP-NEXT: s_mov_b32 s4, s9 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064_DPP-NEXT: s_mov_b32 s0, s9 ; GFX1064_DPP-NEXT: v_writelane_b32 v3, s8, 48 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB8_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s4 -; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s0 +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: s_mov_b32 s4, s2 -; GFX1064_DPP-NEXT: s_mov_b32 s5, s3 -; GFX1064_DPP-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc +; GFX1064_DPP-NEXT: s_mov_b32 s0, s6 +; GFX1064_DPP-NEXT: s_mov_b32 s1, s7 +; GFX1064_DPP-NEXT: buffer_atomic_sub v0, off, s[0:3], 0 glc ; GFX1064_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl1_inv ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB8_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064_DPP-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX1064_DPP-NEXT: s_mov_b32 s2, s6 -; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: s_mov_b32 s6, s2 +; GFX1064_DPP-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: sub_i32_varying: @@ -5041,44 +5040,44 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s2, v1, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v3, s5, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_DPP-NEXT: s_mov_b32 s4, s6 -; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_mov_b32 s0, s2 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB8_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s4 -; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s0 +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: s_mov_b32 s4, s2 -; GFX1032_DPP-NEXT: s_mov_b32 s5, s3 -; GFX1032_DPP-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc +; GFX1032_DPP-NEXT: s_mov_b32 s0, s6 +; GFX1032_DPP-NEXT: s_mov_b32 s1, s7 +; GFX1032_DPP-NEXT: buffer_atomic_sub v0, off, s[0:3], 0 glc ; GFX1032_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl1_inv ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB8_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032_DPP-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX1032_DPP-NEXT: s_mov_b32 s2, s6 -; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: s_mov_b32 s6, s2 +; GFX1032_DPP-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: sub_i32_varying: @@ -5445,117 +5444,117 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: sub_i64_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b64 s[6:7], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b64 s[8:9], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB9_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s2 -; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7] +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[8:9] ; GFX9-NEXT: s_mul_i32 s2, s2, 5 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: s_mov_b32 s15, 0xf000 +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc +; GFX9-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: .LBB9_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_readfirstlane_b32 s4, v1 -; GFX9-NEXT: v_readfirstlane_b32 s5, v0 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_readfirstlane_b32 s0, v1 +; GFX9-NEXT: v_readfirstlane_b32 s1, v0 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s5, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: sub_i64_constant: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b64 s[6:7], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b64 s[8:9], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB9_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[8:9] ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_mul_i32 s6, s6, 5 +; GFX1064-NEXT: s_mul_i32 s2, s2, 5 ; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1064-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064-NEXT: v_mov_b32_e32 v0, s2 ; GFX1064-NEXT: s_mov_b32 s10, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mov_b32 s8, s2 -; GFX1064-NEXT: s_mov_b32 s9, s3 +; GFX1064-NEXT: s_mov_b32 s8, s6 +; GFX1064-NEXT: s_mov_b32 s9, s7 ; GFX1064-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB9_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v2 -; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1064-NEXT: v_readfirstlane_b32 s1, v1 ; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 -; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v0 -; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064-NEXT: s_mov_b32 s6, -1 +; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s0, v0 +; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc +; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: sub_i64_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 s5, exec_lo +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 s1, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB9_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 +; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_mul_i32 s5, s5, 5 +; GFX1032-NEXT: s_mul_i32 s1, s1, 5 ; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1032-NEXT: v_mov_b32_e32 v0, s5 +; GFX1032-NEXT: v_mov_b32_e32 v0, s1 ; GFX1032-NEXT: s_mov_b32 s10, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mov_b32 s8, s2 -; GFX1032-NEXT: s_mov_b32 s9, s3 +; GFX1032-NEXT: s_mov_b32 s8, s6 +; GFX1032-NEXT: s_mov_b32 s9, s7 ; GFX1032-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB9_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1032-NEXT: v_readfirstlane_b32 s1, v1 ; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 -; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 -; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032-NEXT: s_mov_b32 s6, -1 +; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0 +; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo +; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: sub_i64_constant: @@ -6253,160 +6252,160 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9_ITERATIVE-LABEL: sub_i64_varying: ; GFX9_ITERATIVE: ; %bb.0: ; %entry -; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX9_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1] +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[4:5] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s6 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 -; GFX9_ITERATIVE-NEXT: s_add_u32 s4, s4, s8 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s5, m0 -; GFX9_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX9_ITERATIVE-NEXT: s_add_u32 s0, s0, s8 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_ITERATIVE-NEXT: s_mov_b32 s8, s2 -; GFX9_ITERATIVE-NEXT: s_mov_b32 s9, s3 -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s8, s6 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s9, s7 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX9_ITERATIVE-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc ; GFX9_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX9_ITERATIVE-NEXT: buffer_wbinvl1_vol ; GFX9_ITERATIVE-NEXT: .LBB11_4: -; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 -; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 -; GFX9_ITERATIVE-NEXT: v_sub_co_u32_e32 v0, vcc, s5, v1 +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s0, v4 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s1, v3 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX9_ITERATIVE-NEXT: v_sub_co_u32_e32 v0, vcc, s1, v1 ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 -; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s7, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s6, -1 ; GFX9_ITERATIVE-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v2, vcc -; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9_ITERATIVE-NEXT: s_endpgm ; ; GFX1064_ITERATIVE-LABEL: sub_i64_varying: ; GFX1064_ITERATIVE: ; %bb.0: ; %entry ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[4:5] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s6 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s5, s6 -; GFX1064_ITERATIVE-NEXT: s_add_u32 s4, s4, s7 -; GFX1064_ITERATIVE-NEXT: s_addc_u32 s5, s5, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6 +; GFX1064_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 +; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_ITERATIVE-NEXT: s_mov_b32 s8, s2 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s8, s6 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s9, s7 ; GFX1064_ITERATIVE-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc ; GFX1064_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1064_ITERATIVE-NEXT: .LBB11_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s0, v3 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s1, v4 ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 -; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 -; GFX1064_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc, s2, v1 -; GFX1064_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc -; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 -; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s6, -1 +; GFX1064_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc, s0, v1 +; GFX1064_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v2, vcc +; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX1064_ITERATIVE-NEXT: s_endpgm ; ; GFX1032_ITERATIVE-LABEL: sub_i64_varying: ; GFX1032_ITERATIVE: ; %bb.0: ; %entry ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo -; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1032_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s1 -; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s1 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s1 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s5, s1 -; GFX1032_ITERATIVE-NEXT: s_add_u32 s4, s4, s6 -; GFX1032_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s5, s4 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s5 +; GFX1032_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 +; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s6, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s6, exec_lo, s6 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s8, s2 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s8, s6 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s9, s7 ; GFX1032_ITERATIVE-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc ; GFX1032_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1032_ITERATIVE-NEXT: .LBB11_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s0, v3 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s1, v4 ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 -; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 -; GFX1032_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v1 -; GFX1032_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 -; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s6, -1 +; GFX1032_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v1 +; GFX1032_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v2, vcc_lo +; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX1032_ITERATIVE-NEXT: s_endpgm ; ; GFX1164_ITERATIVE-LABEL: sub_i64_varying: @@ -6760,22 +6759,22 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9_DPP-LABEL: sub_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry -; GFX9_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8 ; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX9_DPP-NEXT: s_nop 0 @@ -6823,39 +6822,39 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc -; GFX9_DPP-NEXT: v_readlane_b32 s7, v4, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s6, v3, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s2, v3, 63 ; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB11_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s3 ; GFX9_DPP-NEXT: s_mov_b32 s11, 0xf000 ; GFX9_DPP-NEXT: s_mov_b32 s10, -1 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_DPP-NEXT: s_mov_b32 s8, s2 -; GFX9_DPP-NEXT: s_mov_b32 s9, s3 -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s6 +; GFX9_DPP-NEXT: s_mov_b32 s8, s6 +; GFX9_DPP-NEXT: s_mov_b32 s9, s7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s2 ; GFX9_DPP-NEXT: buffer_atomic_sub_x2 v[7:8], off, s[8:11], 0 glc ; GFX9_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9_DPP-NEXT: buffer_wbinvl1_vol ; GFX9_DPP-NEXT: .LBB11_2: -; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v8 -; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v7 +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: v_readfirstlane_b32 s0, v8 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s1, v7 ; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 -; GFX9_DPP-NEXT: v_sub_co_u32_e32 v7, vcc, s5, v7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s0 +; GFX9_DPP-NEXT: v_sub_co_u32_e32 v7, vcc, s1, v7 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 -; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: s_mov_b32 s7, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s6, -1 ; GFX9_DPP-NEXT: v_subb_co_u32_e32 v8, vcc, v0, v8, vcc -; GFX9_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 +; GFX9_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 ; GFX9_DPP-NEXT: s_endpgm ; ; GFX1064_DPP-LABEL: sub_i64_varying: @@ -6918,59 +6917,59 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 ; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v4, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v4, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v3, 15 ; GFX1064_DPP-NEXT: v_readlane_b32 s8, v4, 31 ; GFX1064_DPP-NEXT: v_readlane_b32 s9, v3, 31 ; GFX1064_DPP-NEXT: v_readlane_b32 s10, v3, 47 -; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 16 -; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v3, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s2, 16 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s3, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v3, 63 ; GFX1064_DPP-NEXT: v_readlane_b32 s11, v4, 47 -; GFX1064_DPP-NEXT: v_readlane_b32 s7, v4, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v4, 63 ; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 32 ; GFX1064_DPP-NEXT: v_writelane_b32 v1, s9, 32 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GFX1064_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX1064_DPP-NEXT: v_writelane_b32 v2, s11, 48 ; GFX1064_DPP-NEXT: v_writelane_b32 v1, s10, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[8:9] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB11_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s5 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s4 -; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: s_mov_b32 s4, s2 -; GFX1064_DPP-NEXT: s_mov_b32 s5, s3 -; GFX1064_DPP-NEXT: buffer_atomic_sub_x2 v[9:10], off, s[4:7], 0 glc +; GFX1064_DPP-NEXT: s_mov_b32 s0, s6 +; GFX1064_DPP-NEXT: s_mov_b32 s1, s7 +; GFX1064_DPP-NEXT: buffer_atomic_sub_x2 v[9:10], off, s[0:3], 0 glc ; GFX1064_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl1_inv ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB11_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s2, v9 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v9 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, v2 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v10 -; GFX1064_DPP-NEXT: v_sub_co_u32 v9, vcc, s2, v11 -; GFX1064_DPP-NEXT: s_mov_b32 s2, s6 -; GFX1064_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc, s3, v12, vcc -; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: v_sub_co_u32 v9, vcc, s0, v11 +; GFX1064_DPP-NEXT: s_mov_b32 s6, s2 +; GFX1064_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc, s1, v12, vcc +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: sub_i64_varying: @@ -7021,51 +7020,51 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s6, -1 +; GFX1032_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 ; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo -; GFX1032_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s8, v4, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s7, v3, 15 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s6 +; GFX1032_DPP-NEXT: v_readlane_b32 s3, v3, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s6, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v2, s8, 16 -; GFX1032_DPP-NEXT: v_writelane_b32 v1, s7, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s6 +; GFX1032_DPP-NEXT: v_writelane_b32 v1, s3, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB11_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s5 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s4 -; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: s_mov_b32 s4, s2 -; GFX1032_DPP-NEXT: s_mov_b32 s5, s3 -; GFX1032_DPP-NEXT: buffer_atomic_sub_x2 v[9:10], off, s[4:7], 0 glc +; GFX1032_DPP-NEXT: s_mov_b32 s0, s6 +; GFX1032_DPP-NEXT: s_mov_b32 s1, s7 +; GFX1032_DPP-NEXT: buffer_atomic_sub_x2 v[9:10], off, s[0:3], 0 glc ; GFX1032_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl1_inv ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB11_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s2, v9 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v9 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, v2 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v10 -; GFX1032_DPP-NEXT: v_sub_co_u32 v9, vcc_lo, s2, v11 -; GFX1032_DPP-NEXT: s_mov_b32 s2, s6 -; GFX1032_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc_lo, s3, v12, vcc_lo -; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: v_sub_co_u32 v9, vcc_lo, s0, v11 +; GFX1032_DPP-NEXT: s_mov_b32 s6, s2 +; GFX1032_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc_lo, s1, v12, vcc_lo +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: sub_i64_varying: diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 1439d4b40c951..eb05613da0453 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -1839,111 +1839,112 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; ; GFX9-LABEL: add_i64_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b64 s[6:7], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b64 s[8:9], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB5_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[8:9] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s7, s3, s6 -; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 -; GFX9-NEXT: s_add_i32 s8, s8, s7 -; GFX9-NEXT: s_mul_i32 s6, s2, s6 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_mul_i32 s3, s7, s2 +; GFX9-NEXT: s_mul_hi_u32 s8, s6, s2 +; GFX9-NEXT: s_add_i32 s8, s8, s3 +; GFX9-NEXT: s_mul_i32 s2, s6, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB5_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_readfirstlane_b32 s4, v1 -; GFX9-NEXT: v_readfirstlane_b32 s5, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_readfirstlane_b32 s0, v1 +; GFX9-NEXT: v_readfirstlane_b32 s1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, v[0:1] -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2] -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v2, v[0:1] +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v2, v[1:2] +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_nop 2 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: add_i64_uniform: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b64 s[6:7], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b64 s[8:9], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB5_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[8:9] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mul_i32 s7, s3, s6 -; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 -; GFX1064-NEXT: s_mul_i32 s6, s2, s6 -; GFX1064-NEXT: s_add_i32 s8, s8, s7 -; GFX1064-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064-NEXT: s_mul_i32 s3, s7, s2 +; GFX1064-NEXT: s_mul_hi_u32 s8, s6, s2 +; GFX1064-NEXT: s_mul_i32 s2, s6, s2 +; GFX1064-NEXT: s_add_i32 s8, s8, s3 +; GFX1064-NEXT: v_mov_b32_e32 v0, s2 ; GFX1064-NEXT: v_mov_b32_e32 v1, s8 ; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB5_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064-NEXT: v_readfirstlane_b32 s5, v1 -; GFX1064-NEXT: v_readfirstlane_b32 s4, v0 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: v_readfirstlane_b32 s1, v1 +; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, s[4:5] -; GFX1064-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2] -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v2, s[0:1] +; GFX1064-NEXT: s_mov_b32 s6, -1 +; GFX1064-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v2, v[1:2] +; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: add_i64_uniform: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 s5, exec_lo +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 s1, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB5_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 +; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s6, s3, s5 -; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 -; GFX1032-NEXT: s_mul_i32 s5, s2, s5 -; GFX1032-NEXT: s_add_i32 s7, s7, s6 -; GFX1032-NEXT: v_mov_b32_e32 v0, s5 -; GFX1032-NEXT: v_mov_b32_e32 v1, s7 +; GFX1032-NEXT: s_mul_i32 s2, s7, s1 +; GFX1032-NEXT: s_mul_hi_u32 s3, s6, s1 +; GFX1032-NEXT: s_mul_i32 s1, s6, s1 +; GFX1032-NEXT: s_add_i32 s3, s3, s2 +; GFX1032-NEXT: v_mov_b32_e32 v0, s1 +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB5_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: v_readfirstlane_b32 s5, v1 -; GFX1032-NEXT: v_readfirstlane_b32 s4, v0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: v_readfirstlane_b32 s1, v1 +; GFX1032-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v2, s[4:5] -; GFX1032-NEXT: v_mad_u64_u32 v[1:2], s2, s3, v2, v[1:2] -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s6, v2, s[0:1] +; GFX1032-NEXT: s_mov_b32 s6, -1 +; GFX1032-NEXT: v_mad_u64_u32 v[1:2], s0, s7, v2, v[1:2] +; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: add_i64_uniform: @@ -5501,119 +5502,119 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; ; GFX9-LABEL: sub_i64_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b64 s[6:7], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b64 s[8:9], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB13_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[8:9] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s7, s3, s6 -; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 -; GFX9-NEXT: s_add_i32 s8, s8, s7 -; GFX9-NEXT: s_mul_i32 s6, s2, s6 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_mul_i32 s3, s7, s2 +; GFX9-NEXT: s_mul_hi_u32 s8, s6, s2 +; GFX9-NEXT: s_add_i32 s8, s8, s3 +; GFX9-NEXT: s_mul_i32 s2, s6, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB13_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5] -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 -; GFX9-NEXT: v_readfirstlane_b32 s1, v0 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s6, v2, 0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[6:7], s7, v2, v[4:5] +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: v_readfirstlane_b32 s5, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s1, v3 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s5, v3 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: sub_i64_uniform: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b64 s[6:7], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b64 s[8:9], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB13_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[8:9] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mul_i32 s7, s3, s6 -; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 -; GFX1064-NEXT: s_mul_i32 s6, s2, s6 -; GFX1064-NEXT: s_add_i32 s8, s8, s7 -; GFX1064-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064-NEXT: s_mul_i32 s3, s7, s2 +; GFX1064-NEXT: s_mul_hi_u32 s8, s6, s2 +; GFX1064-NEXT: s_mul_i32 s2, s6, s2 +; GFX1064-NEXT: s_add_i32 s8, s8, s3 +; GFX1064-NEXT: v_mov_b32_e32 v0, s2 ; GFX1064-NEXT: v_mov_b32_e32 v1, s8 ; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB13_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0 -; GFX1064-NEXT: v_readfirstlane_b32 s4, v1 -; GFX1064-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5] -; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v3 +; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s6, v2, 0 +; GFX1064-NEXT: s_mov_b32 s6, -1 +; GFX1064-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s7, v2, v[4:5] +; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s1, v1 +; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s0, v3 ; GFX1064-NEXT: v_mov_b32_e32 v1, v4 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc -; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc +; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: sub_i64_uniform: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 s5, exec_lo +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 s1, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB13_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 +; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s6, s3, s5 -; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 -; GFX1032-NEXT: s_mul_i32 s5, s2, s5 -; GFX1032-NEXT: s_add_i32 s7, s7, s6 -; GFX1032-NEXT: v_mov_b32_e32 v0, s5 -; GFX1032-NEXT: v_mov_b32_e32 v1, s7 +; GFX1032-NEXT: s_mul_i32 s2, s7, s1 +; GFX1032-NEXT: s_mul_hi_u32 s3, s6, s1 +; GFX1032-NEXT: s_mul_i32 s1, s6, s1 +; GFX1032-NEXT: s_add_i32 s3, s3, s2 +; GFX1032-NEXT: v_mov_b32_e32 v0, s1 +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB13_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s2, s2, v2, 0 -; GFX1032-NEXT: v_readfirstlane_b32 s4, v1 -; GFX1032-NEXT: v_mad_u64_u32 v[4:5], s2, s3, v2, v[4:5] -; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 +; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s0, s6, v2, 0 +; GFX1032-NEXT: v_readfirstlane_b32 s1, v1 +; GFX1032-NEXT: s_mov_b32 s6, -1 +; GFX1032-NEXT: v_mad_u64_u32 v[4:5], s0, s7, v2, v[4:5] +; GFX1032-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v3 ; GFX1032-NEXT: v_mov_b32_e32 v1, v4 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo -; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo +; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: sub_i64_uniform: diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll index f636fa5d83a57..7ae4766584e00 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -426,13 +426,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W32-NEXT: .LBB1_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX10W32-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s4, v1 +; GFX10W32-NEXT: s_mov_b32 null, 0 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v0, s[4:5] +; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v0, s[2:3] ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10W32-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: add_i32_uniform: @@ -1504,14 +1505,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W32-NEXT: .LBB5_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX10W32-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10W32-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: sub_i32_uniform: diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll index 3e8565d34c6be..62e50b2597c17 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -439,13 +439,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W32-NEXT: .LBB1_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX10W32-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s4, v1 +; GFX10W32-NEXT: s_mov_b32 null, 0 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v0, s[4:5] +; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v0, s[2:3] ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10W32-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: add_i32_uniform: @@ -1674,14 +1675,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W32-NEXT: .LBB6_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX10W32-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10W32-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: sub_i32_uniform: diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll index 0bd030f1a3750..04d72691a088a 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll @@ -15,7 +15,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr10_sgpr11 = COPY $sgpr8_sgpr9 ; GFX90A-NEXT: renamable $vgpr31 = COPY $vgpr0, implicit $exec ; GFX90A-NEXT: renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr6_sgpr7, 24, 0 :: (dereferenceable invariant load (s32) from %ir.arg4.kernarg.offset.align.down, align 8, addrspace 4) - ; GFX90A-NEXT: renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM renamable $sgpr6_sgpr7, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg6.kernarg.offset.align.down, align 8, addrspace 4) + ; GFX90A-NEXT: early-clobber renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM_ec renamable $sgpr6_sgpr7, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg6.kernarg.offset.align.down, align 8, addrspace 4) ; GFX90A-NEXT: renamable $sgpr15 = S_LOAD_DWORD_IMM renamable $sgpr6_sgpr7, 40, 0 :: (dereferenceable invariant load (s32) from %ir.arg6.kernarg.offset.align.down + 16, align 8, addrspace 4) ; GFX90A-NEXT: renamable $sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX4_IMM renamable $sgpr6_sgpr7, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_LOAD_DWORDX2_IMM renamable $sgpr6_sgpr7, 16, 0 :: (dereferenceable invariant load (s64) from %ir.arg.kernarg.offset1 + 16, align 16, addrspace 4) diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll index f248708d16ea2..0cc10512af5cd 100644 --- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll +++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll @@ -157,26 +157,26 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) { ; ; GFX9-LABEL: sadd64ri: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s2, s2, 0x56789876 -; GFX9-NEXT: s_addc_u32 s3, s3, 0x1234 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_add_u32 s0, s6, 0x56789876 +; GFX9-NEXT: s_addc_u32 s1, s7, 0x1234 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX1010-LABEL: sadd64ri: ; GFX1010: ; %bb.0: ; %entry -; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: s_add_u32 s2, s2, 0x56789876 -; GFX1010-NEXT: s_addc_u32 s3, s3, 0x1234 -; GFX1010-NEXT: v_mov_b32_e32 v0, s2 -; GFX1010-NEXT: v_mov_b32_e32 v1, s3 -; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX1010-NEXT: s_add_u32 s0, s6, 0x56789876 +; GFX1010-NEXT: s_addc_u32 s1, s7, 0x1234 +; GFX1010-NEXT: v_mov_b32_e32 v0, s0 +; GFX1010-NEXT: v_mov_b32_e32 v1, s1 +; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX1010-NEXT: s_endpgm ; ; GFX1030W32-LABEL: sadd64ri: @@ -255,23 +255,23 @@ define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) { ; ; GFX9-LABEL: vadd64rr: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX1010-LABEL: vadd64rr: ; GFX1010: ; %bb.0: ; %entry -; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: v_add_co_u32 v0, s2, s2, v0 -; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX1010-NEXT: v_add_co_u32 v0, s0, s6, v0 +; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s0, s7, 0, s0 +; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX1010-NEXT: s_endpgm ; ; GFX1030W32-LABEL: vadd64rr: @@ -679,34 +679,34 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX9-LABEL: suaddo64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s6, s4, s6 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_addc_u32 s7, s5, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: s_add_u32 s0, s8, s10 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_addc_u32 s1, s9, s11 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] -; GFX9-NEXT: global_store_byte v4, v0, s[2:3] +; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] +; GFX9-NEXT: global_store_byte v4, v0, s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX1010-LABEL: suaddo64: ; GFX1010: ; %bb.0: -; GFX1010-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1010-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: s_add_u32 s6, s4, s6 -; GFX1010-NEXT: s_addc_u32 s7, s5, s7 -; GFX1010-NEXT: v_mov_b32_e32 v0, s6 -; GFX1010-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[4:5] -; GFX1010-NEXT: v_mov_b32_e32 v1, s7 -; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 -; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX1010-NEXT: global_store_byte v2, v3, s[2:3] +; GFX1010-NEXT: s_add_u32 s0, s8, s10 +; GFX1010-NEXT: s_addc_u32 s1, s9, s11 +; GFX1010-NEXT: v_mov_b32_e32 v0, s0 +; GFX1010-NEXT: v_mov_b32_e32 v1, s1 +; GFX1010-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[8:9] +; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 +; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX1010-NEXT: global_store_byte v2, v3, s[6:7] ; GFX1010-NEXT: s_endpgm ; ; GFX1030W32-LABEL: suaddo64: @@ -1048,26 +1048,26 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) { ; ; GFX9-LABEL: ssub64ri: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_u32 s2, 0x56789876, s2 -; GFX9-NEXT: s_subb_u32 s3, 0x1234, s3 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_sub_u32 s0, 0x56789876, s6 +; GFX9-NEXT: s_subb_u32 s1, 0x1234, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX1010-LABEL: ssub64ri: ; GFX1010: ; %bb.0: ; %entry -; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: s_sub_u32 s2, 0x56789876, s2 -; GFX1010-NEXT: s_subb_u32 s3, 0x1234, s3 -; GFX1010-NEXT: v_mov_b32_e32 v0, s2 -; GFX1010-NEXT: v_mov_b32_e32 v1, s3 -; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX1010-NEXT: s_sub_u32 s0, 0x56789876, s6 +; GFX1010-NEXT: s_subb_u32 s1, 0x1234, s7 +; GFX1010-NEXT: v_mov_b32_e32 v0, s0 +; GFX1010-NEXT: v_mov_b32_e32 v1, s1 +; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX1010-NEXT: s_endpgm ; ; GFX1030W32-LABEL: ssub64ri: @@ -1146,23 +1146,23 @@ define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) { ; ; GFX9-LABEL: vsub64rr: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0 ; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX1010-LABEL: vsub64rr: ; GFX1010: ; %bb.0: ; %entry -; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: v_sub_co_u32 v0, s2, s2, v0 -; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX1010-NEXT: v_sub_co_u32 v0, s0, s6, v0 +; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s0, s7, 0, s0 +; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX1010-NEXT: s_endpgm ; ; GFX1030W32-LABEL: vsub64rr: @@ -1571,34 +1571,34 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX9-LABEL: susubo64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_u32 s6, s4, s6 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_subb_u32 s7, s5, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: s_sub_u32 s0, s8, s10 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_subb_u32 s1, s9, s11 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] -; GFX9-NEXT: global_store_byte v4, v0, s[2:3] +; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] +; GFX9-NEXT: global_store_byte v4, v0, s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX1010-LABEL: susubo64: ; GFX1010: ; %bb.0: -; GFX1010-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1010-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: s_sub_u32 s6, s4, s6 -; GFX1010-NEXT: s_subb_u32 s7, s5, s7 -; GFX1010-NEXT: v_mov_b32_e32 v0, s6 -; GFX1010-NEXT: v_cmp_gt_u64_e64 s4, s[6:7], s[4:5] -; GFX1010-NEXT: v_mov_b32_e32 v1, s7 -; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 -; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX1010-NEXT: global_store_byte v2, v3, s[2:3] +; GFX1010-NEXT: s_sub_u32 s0, s8, s10 +; GFX1010-NEXT: s_subb_u32 s1, s9, s11 +; GFX1010-NEXT: v_mov_b32_e32 v0, s0 +; GFX1010-NEXT: v_mov_b32_e32 v1, s1 +; GFX1010-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], s[8:9] +; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 +; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX1010-NEXT: global_store_byte v2, v3, s[6:7] ; GFX1010-NEXT: s_endpgm ; ; GFX1030W32-LABEL: susubo64: diff --git a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll index e1717a816de0d..45324392aacde 100644 --- a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll +++ b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll @@ -40,13 +40,13 @@ define amdgpu_kernel void @v_clamp_add_src_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_clamp_add_src_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_add_src_f32: @@ -117,14 +117,14 @@ define amdgpu_kernel void @v_clamp_multi_use_src_f32(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_clamp_multi_use_src_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_max_f32_e64 v2, v1, v1 clamp -; GFX9-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-NEXT: global_store_dword v0, v2, s[4:5] ; GFX9-NEXT: global_store_dword v[0:1], v1, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -194,13 +194,13 @@ define amdgpu_kernel void @v_clamp_dbg_use_src_f32(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: v_clamp_dbg_use_src_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_dbg_use_src_f32: @@ -267,14 +267,14 @@ define amdgpu_kernel void @v_clamp_add_neg_src_f32(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: v_clamp_add_neg_src_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_floor_f32_e32 v1, v1 ; GFX9-NEXT: v_max_f32_e64 v1, -v1, -v1 clamp -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_add_neg_src_f32: @@ -342,14 +342,14 @@ define amdgpu_kernel void @v_non_clamp_max_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_non_clamp_max_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_max_f32_e32 v1, 0, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_non_clamp_max_f32: @@ -413,13 +413,13 @@ define amdgpu_kernel void @v_clamp_add_src_f32_denormals(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_clamp_add_src_f32_denormals: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_add_src_f32_denormals: @@ -485,13 +485,13 @@ define amdgpu_kernel void @v_clamp_add_src_f16_denorm(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_clamp_add_src_f16_denorm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f16_e64 v1, v1, 1.0 clamp -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_add_src_f16_denorm: @@ -557,13 +557,13 @@ define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_clamp_add_src_f16_no_denormals: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f16_e64 v1, v1, 1.0 clamp -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_add_src_f16_no_denormals: @@ -629,14 +629,14 @@ define amdgpu_kernel void @v_clamp_add_src_v2f32(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_clamp_add_src_v2f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e64 v0, v0, 1.0 clamp ; GFX9-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_add_src_v2f32: @@ -701,13 +701,13 @@ define amdgpu_kernel void @v_clamp_add_src_f64(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_clamp_add_src_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 clamp -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_add_src_f64: @@ -866,13 +866,13 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm(ptr addrspace(1) %out, p ; ; GFX9-LABEL: v_clamp_add_src_v2f16_denorm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] clamp -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_add_src_v2f16_denorm: @@ -947,13 +947,13 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_no_denormals(ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_add_src_v2f16_no_denormals: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] clamp -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_add_src_v2f16_no_denormals: @@ -1038,14 +1038,14 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_neg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_neg: @@ -1124,14 +1124,14 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_lo(ptr addrspace(1) ; ; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_neg_lo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] clamp -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_neg_lo: @@ -1212,14 +1212,14 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_hi(ptr addrspace(1) ; ; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_neg_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_hi:[1,1] clamp -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_neg_hi: @@ -1298,14 +1298,14 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_shuf(ptr addrspace(1) %o ; ; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_shuf: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_shuf: @@ -1382,14 +1382,14 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f32_src(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_no_clamp_add_src_v2f16_f32_src: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 clamp -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_no_clamp_add_src_v2f16_f32_src: @@ -1469,14 +1469,14 @@ define amdgpu_kernel void @v_no_clamp_add_packed_src_f32(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_no_clamp_add_packed_src_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] ; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_no_clamp_add_packed_src_f32: @@ -1553,15 +1553,15 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f16_src(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_no_clamp_add_src_v2f16_f16_src: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v1, s[2:3] +; GFX9-NEXT: global_load_ushort v1, v1, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f16_e32 v1, 1.0, v1 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 clamp -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_no_clamp_add_src_v2f16_f16_src: diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll index 9b6c50c10d90d..94482d7f0d809 100644 --- a/llvm/test/CodeGen/AMDGPU/clamp.ll +++ b/llvm/test/CodeGen/AMDGPU/clamp.ll @@ -41,13 +41,13 @@ define amdgpu_kernel void @v_clamp_f32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_f32: @@ -126,13 +126,13 @@ define amdgpu_kernel void @v_clamp_neg_f32(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: v_clamp_neg_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e64 v1, -v1, -v1 clamp -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_neg_f32: @@ -212,13 +212,13 @@ define amdgpu_kernel void @v_clamp_negabs_f32(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: v_clamp_negabs_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e64 v1, -|v1|, -|v1| clamp -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_negabs_f32: @@ -304,15 +304,15 @@ define amdgpu_kernel void @v_clamp_negzero_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_clamp_negzero_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, 0.5, v1 ; GFX9-NEXT: v_max_f32_e32 v1, 0x80000000, v1 ; GFX9-NEXT: v_min_f32_e32 v1, 1.0, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_negzero_f32: @@ -400,15 +400,15 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_clamp_negzero_maybe_snan_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX9-NEXT: v_max_f32_e32 v1, 0x80000000, v1 ; GFX9-NEXT: v_min_f32_e32 v1, 1.0, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_negzero_maybe_snan_f32: @@ -498,15 +498,15 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_clamp_multi_use_max_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX9-NEXT: v_max_f32_e32 v1, 0, v1 ; GFX9-NEXT: v_min_f32_e32 v2, 1.0, v1 -; GFX9-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-NEXT: global_store_dword v0, v2, s[4:5] ; GFX9-NEXT: global_store_dword v[0:1], v1, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -600,13 +600,13 @@ define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f16_e64 v1, v1, v1 clamp -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_f16: @@ -686,13 +686,13 @@ define amdgpu_kernel void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: v_clamp_neg_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f16_e64 v1, -v1, -v1 clamp -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_neg_f16: @@ -773,13 +773,13 @@ define amdgpu_kernel void @v_clamp_negabs_f16(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: v_clamp_negabs_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| clamp -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_negabs_f16: @@ -861,13 +861,13 @@ define amdgpu_kernel void @v_clamp_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] clamp -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_f64: @@ -946,13 +946,13 @@ define amdgpu_kernel void @v_clamp_neg_f64(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: v_clamp_neg_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] clamp -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_neg_f64: @@ -1032,13 +1032,13 @@ define amdgpu_kernel void @v_clamp_negabs_f64(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: v_clamp_negabs_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_negabs_f64: @@ -1122,14 +1122,14 @@ define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(ptr addrspace(1) %out, p ; ; GFX9-LABEL: v_clamp_med3_aby_negzero_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_brev_b32 s0, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: s_brev_b32 s2, 1 +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_med3_f32 v1, s2, 1.0, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_med3_f32 v1, s0, 1.0, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_med3_aby_negzero_f32: @@ -1206,13 +1206,13 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: v_clamp_med3_aby_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_med3_aby_f32: @@ -1289,13 +1289,13 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: v_clamp_med3_bay_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_med3_bay_f32: @@ -1372,13 +1372,13 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: v_clamp_med3_yab_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_med3_yab_f32: @@ -1455,13 +1455,13 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: v_clamp_med3_yba_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_med3_yba_f32: @@ -1538,13 +1538,13 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: v_clamp_med3_ayb_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_med3_ayb_f32: @@ -1621,13 +1621,13 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: v_clamp_med3_bya_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_med3_bya_f32: @@ -2091,14 +2091,14 @@ define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_clamp_f32_no_dx10_clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, 0.5, v1 ; GFX9-NEXT: v_med3_f32 v1, v1, 0, 1.0 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_f32_no_dx10_clamp: @@ -2179,13 +2179,13 @@ define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_clamp_f32_snan_dx10clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e64 v1, v1, 0.5 clamp -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_f32_snan_dx10clamp: @@ -2267,14 +2267,14 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_clamp_f32_snan_no_dx10clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX9-NEXT: v_med3_f32 v1, v1, 0, 1.0 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_f32_snan_no_dx10clamp: @@ -2356,14 +2356,14 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(ptr addrspace( ; ; GFX9-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_med3_f32 v1, v1, 0, 1.0 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src: @@ -2444,13 +2444,13 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp: @@ -2527,13 +2527,13 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp: @@ -2610,13 +2610,13 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, v1, 0, 1.0 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp: @@ -2693,13 +2693,13 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, v1, 1.0, 0 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp: @@ -2776,13 +2776,13 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, 0, v1, 1.0 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp: @@ -2859,13 +2859,13 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, 1.0, v1, 0 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp: @@ -3078,13 +3078,13 @@ define amdgpu_kernel void @v_clamp_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: v_clamp_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 clamp -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_v2f16: @@ -3180,13 +3180,13 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: v_clamp_v2f16_undef_elt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 clamp -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_v2f16_undef_elt: @@ -3277,15 +3277,15 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: v_clamp_v2f16_not_zero: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX9-NEXT: v_pk_max_f16 v1, v1, 2.0 ; GFX9-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_v2f16_not_zero: @@ -3381,15 +3381,15 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_clamp_v2f16_not_one: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX9-NEXT: v_pk_max_f16 v1, v1, 0 ; GFX9-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel:[0,1] op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_v2f16_not_one: @@ -3483,13 +3483,13 @@ define amdgpu_kernel void @v_clamp_neg_v2f16(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: v_clamp_neg_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_neg_v2f16: @@ -3578,14 +3578,14 @@ define amdgpu_kernel void @v_clamp_negabs_v2f16(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: v_clamp_negabs_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_negabs_v2f16: @@ -3678,13 +3678,13 @@ define amdgpu_kernel void @v_clamp_neglo_v2f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_clamp_neglo_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] clamp -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_neglo_v2f16: @@ -3774,13 +3774,13 @@ define amdgpu_kernel void @v_clamp_neghi_v2f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_clamp_neghi_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_hi:[1,1] clamp -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_neghi_v2f16: @@ -3870,13 +3870,13 @@ define amdgpu_kernel void @v_clamp_v2f16_shuffle(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_clamp_v2f16_shuffle: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_v2f16_shuffle: @@ -3973,13 +3973,13 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out ; ; GFX9-LABEL: v_clamp_v2f16_undef_limit_elts0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 clamp -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_v2f16_undef_limit_elts0: @@ -4075,13 +4075,13 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out ; ; GFX9-LABEL: v_clamp_v2f16_undef_limit_elts1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 clamp -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_v2f16_undef_limit_elts1: @@ -4163,18 +4163,18 @@ define amdgpu_kernel void @v_clamp_diff_source_f32(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: v_clamp_diff_source_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v1 -; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_add_f32_e32 v1, s0, v1 +; GFX9-NEXT: v_add_f32_e32 v2, s0, v2 ; GFX9-NEXT: v_max_f32_e64 v1, v1, v2 clamp -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] offset:12 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] offset:12 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_diff_source_f32: diff --git a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll index 48bd8f9b80799..1e2a2dcf1e333 100644 --- a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll @@ -4,13 +4,13 @@ define amdgpu_kernel void @vectorLoadCombine(ptr %in, ptr %out) { ; GCN-LABEL: vectorLoadCombine: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: flat_load_dword v2, v[0:1] -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm @@ -37,14 +37,14 @@ entry: define amdgpu_kernel void @vectorLoadShuffle(ptr %in, ptr %out) { ; GCN-LABEL: vectorLoadShuffle: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NEXT: s_mov_b32 s0, 0x7050604 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: flat_load_dword v2, v[0:1] -; GCN-NEXT: s_mov_b32 s0, 0x7050604 -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: v_perm_b32 v2, v2, v2, s0 ; GCN-NEXT: flat_store_dword v[0:1], v2 diff --git a/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll b/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll index c57ee9cc6a1e2..ce181b6223e41 100644 --- a/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll +++ b/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll @@ -8,21 +8,21 @@ define amdgpu_kernel void @copy_to_scc(ptr addrspace(1) %out, ptr addrspace(1) % ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:252 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s2, 0 -; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GCN-NEXT: s_xor_b64 s[2:3], s[2:3], vcc -; GCN-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GCN-NEXT: s_cselect_b32 s2, 2, 3 -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: global_store_dword v1, v0, s[0:1] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dword s0, s[6:7], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], vcc +; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN-NEXT: s_cselect_b32 s0, 2, 3 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: global_store_dword v1, v0, s[4:5] ; GCN-NEXT: s_endpgm entry: ; preds = %1009 %0 = load i32, ptr addrspace(1) %in, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index eb959e30b87f4..f3f749b5c054b 100644 --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -2838,16 +2838,16 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr ; ; GFX10-LABEL: cvt_ubyte0_or_multiuse: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[0:1] +; GFX10-NEXT: global_load_dword v0, v0, s[4:5] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_or_b32_e32 v0, 0x80000001, v0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v0 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX10-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-NEXT: global_store_dword v2, v0, s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: cvt_ubyte0_or_multiuse: diff --git a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll index 6799980c18439..de14d64dbf7e9 100644 --- a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll @@ -8,13 +8,13 @@ define protected amdgpu_kernel void @add(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: add: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_add v2, v0, v1, s[0:1] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-NEXT: global_atomic_add v2, v0, v1, s[4:5] glc +; CHECK-NEXT: v_mov_b32_e32 v0, s6 +; CHECK-NEXT: v_mov_b32_e32 v1, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 @@ -30,13 +30,13 @@ define protected amdgpu_kernel void @add(ptr addrspace(1) %p, ptr addrspace(1) % define protected amdgpu_kernel void @sub(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: sub: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_sub v2, v0, v1, s[0:1] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-NEXT: global_atomic_sub v2, v0, v1, s[4:5] glc +; CHECK-NEXT: v_mov_b32_e32 v0, s6 +; CHECK-NEXT: v_mov_b32_e32 v1, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 @@ -52,13 +52,13 @@ define protected amdgpu_kernel void @sub(ptr addrspace(1) %p, ptr addrspace(1) % define protected amdgpu_kernel void @and(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: and: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_and v2, v0, v1, s[0:1] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-NEXT: global_atomic_and v2, v0, v1, s[4:5] glc +; CHECK-NEXT: v_mov_b32_e32 v0, s6 +; CHECK-NEXT: v_mov_b32_e32 v1, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 @@ -74,13 +74,13 @@ define protected amdgpu_kernel void @and(ptr addrspace(1) %p, ptr addrspace(1) % define protected amdgpu_kernel void @or(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: or: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_or v2, v0, v1, s[0:1] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-NEXT: global_atomic_or v2, v0, v1, s[4:5] glc +; CHECK-NEXT: v_mov_b32_e32 v0, s6 +; CHECK-NEXT: v_mov_b32_e32 v1, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 @@ -96,13 +96,13 @@ define protected amdgpu_kernel void @or(ptr addrspace(1) %p, ptr addrspace(1) %q define protected amdgpu_kernel void @xor(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: xor: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_xor v2, v0, v1, s[0:1] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-NEXT: global_atomic_xor v2, v0, v1, s[4:5] glc +; CHECK-NEXT: v_mov_b32_e32 v0, s6 +; CHECK-NEXT: v_mov_b32_e32 v1, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 @@ -118,28 +118,28 @@ define protected amdgpu_kernel void @xor(ptr addrspace(1) %p, ptr addrspace(1) % define protected amdgpu_kernel void @nand(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: nand: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_mov_b64 s[0:1], 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0 +; CHECK-NEXT: s_load_dword s2, s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s6 +; CHECK-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-NEXT: .LBB5_1: ; %atomicrmw.start ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_mov_b32_e32 v3, v0 ; CHECK-NEXT: v_not_b32_e32 v0, v3 ; CHECK-NEXT: v_or_b32_e32 v2, -2, v0 -; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc +; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 -; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] +; CHECK-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; CHECK-NEXT: s_andn2_b64 exec, exec, s[0:1] ; CHECK-NEXT: s_cbranch_execnz .LBB5_1 ; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: v_mov_b32_e32 v2, s2 -; CHECK-NEXT: v_mov_b32_e32 v3, s3 +; CHECK-NEXT: s_or_b64 exec, exec, s[0:1] +; CHECK-NEXT: v_mov_b32_e32 v2, s6 +; CHECK-NEXT: v_mov_b32_e32 v3, s7 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off @@ -154,13 +154,13 @@ define protected amdgpu_kernel void @nand(ptr addrspace(1) %p, ptr addrspace(1) define protected amdgpu_kernel void @max_workgroup(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: max_workgroup: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_smax v2, v0, v1, s[0:1] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-NEXT: global_atomic_smax v2, v0, v1, s[4:5] glc +; CHECK-NEXT: v_mov_b32_e32 v0, s6 +; CHECK-NEXT: v_mov_b32_e32 v1, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 @@ -176,13 +176,13 @@ define protected amdgpu_kernel void @max_workgroup(ptr addrspace(1) %p, ptr addr define protected amdgpu_kernel void @max(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: max: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_smax v2, v0, v1, s[0:1] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-NEXT: global_atomic_smax v2, v0, v1, s[4:5] glc +; CHECK-NEXT: v_mov_b32_e32 v0, s6 +; CHECK-NEXT: v_mov_b32_e32 v1, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 @@ -198,13 +198,13 @@ define protected amdgpu_kernel void @max(ptr addrspace(1) %p, ptr addrspace(1) % define protected amdgpu_kernel void @min_workgroup(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: min_workgroup: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_smin v2, v0, v1, s[0:1] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-NEXT: global_atomic_smin v2, v0, v1, s[4:5] glc +; CHECK-NEXT: v_mov_b32_e32 v0, s6 +; CHECK-NEXT: v_mov_b32_e32 v1, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 @@ -220,13 +220,13 @@ define protected amdgpu_kernel void @min_workgroup(ptr addrspace(1) %p, ptr addr define protected amdgpu_kernel void @min(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: min: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_smin v2, v0, v1, s[0:1] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-NEXT: global_atomic_smin v2, v0, v1, s[4:5] glc +; CHECK-NEXT: v_mov_b32_e32 v0, s6 +; CHECK-NEXT: v_mov_b32_e32 v1, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 @@ -242,13 +242,13 @@ define protected amdgpu_kernel void @min(ptr addrspace(1) %p, ptr addrspace(1) % define protected amdgpu_kernel void @umax_workgroup(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: umax_workgroup: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_umax v2, v0, v1, s[0:1] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-NEXT: global_atomic_umax v2, v0, v1, s[4:5] glc +; CHECK-NEXT: v_mov_b32_e32 v0, s6 +; CHECK-NEXT: v_mov_b32_e32 v1, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 @@ -264,13 +264,13 @@ define protected amdgpu_kernel void @umax_workgroup(ptr addrspace(1) %p, ptr add define protected amdgpu_kernel void @umax(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: umax: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_umax v2, v0, v1, s[0:1] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-NEXT: global_atomic_umax v2, v0, v1, s[4:5] glc +; CHECK-NEXT: v_mov_b32_e32 v0, s6 +; CHECK-NEXT: v_mov_b32_e32 v1, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 @@ -286,13 +286,13 @@ define protected amdgpu_kernel void @umax(ptr addrspace(1) %p, ptr addrspace(1) define protected amdgpu_kernel void @umin_workgroup(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: umin_workgroup: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_umin v2, v0, v1, s[0:1] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-NEXT: global_atomic_umin v2, v0, v1, s[4:5] glc +; CHECK-NEXT: v_mov_b32_e32 v0, s6 +; CHECK-NEXT: v_mov_b32_e32 v1, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 @@ -308,13 +308,13 @@ define protected amdgpu_kernel void @umin_workgroup(ptr addrspace(1) %p, ptr add define protected amdgpu_kernel void @umin(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: umin: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_umin v2, v0, v1, s[0:1] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-NEXT: global_atomic_umin v2, v0, v1, s[4:5] glc +; CHECK-NEXT: v_mov_b32_e32 v0, s6 +; CHECK-NEXT: v_mov_b32_e32 v1, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 @@ -330,14 +330,14 @@ define protected amdgpu_kernel void @umin(ptr addrspace(1) %p, ptr addrspace(1) define protected amdgpu_kernel void @cmpxchg(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: cmpxchg: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: v_mov_b32_e32 v0, 2 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_cmpswap v2, v2, v[0:1], s[0:1] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-NEXT: global_atomic_cmpswap v2, v2, v[0:1], s[4:5] glc +; CHECK-NEXT: v_mov_b32_e32 v0, s6 +; CHECK-NEXT: v_mov_b32_e32 v1, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 @@ -354,13 +354,13 @@ define protected amdgpu_kernel void @cmpxchg(ptr addrspace(1) %p, ptr addrspace( define protected amdgpu_kernel void @xchg(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: xchg: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_swap v2, v0, v1, s[0:1] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-NEXT: global_atomic_swap v2, v0, v1, s[4:5] glc +; CHECK-NEXT: v_mov_b32_e32 v0, s6 +; CHECK-NEXT: v_mov_b32_e32 v1, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 @@ -376,13 +376,13 @@ define protected amdgpu_kernel void @xchg(ptr addrspace(1) %p, ptr addrspace(1) define protected amdgpu_kernel void @inc(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: inc: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_inc v2, v0, v1, s[0:1] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-NEXT: global_atomic_inc v2, v0, v1, s[4:5] glc +; CHECK-NEXT: v_mov_b32_e32 v0, s6 +; CHECK-NEXT: v_mov_b32_e32 v1, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 @@ -398,13 +398,13 @@ define protected amdgpu_kernel void @inc(ptr addrspace(1) %p, ptr addrspace(1) % define protected amdgpu_kernel void @dec(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: dec: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_dec v2, v0, v1, s[0:1] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-NEXT: global_atomic_dec v2, v0, v1, s[4:5] glc +; CHECK-NEXT: v_mov_b32_e32 v0, s6 +; CHECK-NEXT: v_mov_b32_e32 v1, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 @@ -420,28 +420,28 @@ define protected amdgpu_kernel void @dec(ptr addrspace(1) %p, ptr addrspace(1) % define protected amdgpu_kernel void @fadd(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: fadd: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_mov_b64 s[0:1], 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0 +; CHECK-NEXT: s_load_dword s2, s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s6 +; CHECK-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-NEXT: .LBB18_1: ; %atomicrmw.start ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_mov_b32_e32 v3, v0 ; CHECK-NEXT: v_add_f32_e32 v2, 1.0, v3 -; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc +; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 -; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] +; CHECK-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; CHECK-NEXT: s_andn2_b64 exec, exec, s[0:1] ; CHECK-NEXT: s_cbranch_execnz .LBB18_1 ; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: s_or_b64 exec, exec, s[0:1] ; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v0 -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-NEXT: v_mov_b32_e32 v0, s6 +; CHECK-NEXT: v_mov_b32_e32 v1, s7 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off @@ -457,28 +457,28 @@ define protected amdgpu_kernel void @fadd(ptr addrspace(1) %p, ptr addrspace(1) define protected amdgpu_kernel void @fsub(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: fsub: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_mov_b64 s[0:1], 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0 +; CHECK-NEXT: s_load_dword s2, s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s6 +; CHECK-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-NEXT: .LBB19_1: ; %atomicrmw.start ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_mov_b32_e32 v3, v0 ; CHECK-NEXT: v_add_f32_e32 v2, -1.0, v3 -; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc +; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 -; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] +; CHECK-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; CHECK-NEXT: s_andn2_b64 exec, exec, s[0:1] ; CHECK-NEXT: s_cbranch_execnz .LBB19_1 ; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: s_or_b64 exec, exec, s[0:1] ; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v0 -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-NEXT: v_mov_b32_e32 v0, s6 +; CHECK-NEXT: v_mov_b32_e32 v1, s7 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off @@ -494,14 +494,14 @@ define protected amdgpu_kernel void @fsub(ptr addrspace(1) %p, ptr addrspace(1) define protected amdgpu_kernel void @fmin(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: fmin: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0x3ff00000 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_min_f64 v[0:1], v2, v[0:1], s[0:1] glc -; CHECK-NEXT: v_mov_b32_e32 v2, s2 -; CHECK-NEXT: v_mov_b32_e32 v3, s3 +; CHECK-NEXT: global_atomic_min_f64 v[0:1], v2, v[0:1], s[4:5] glc +; CHECK-NEXT: v_mov_b32_e32 v2, s6 +; CHECK-NEXT: v_mov_b32_e32 v3, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cvt_u32_f64_e32 v0, v[0:1] ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3] @@ -519,14 +519,14 @@ define protected amdgpu_kernel void @fmin(ptr addrspace(1) %p, ptr addrspace(1) define protected amdgpu_kernel void @fmax(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: fmax: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0x3ff00000 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_max_f64 v[0:1], v2, v[0:1], s[0:1] glc -; CHECK-NEXT: v_mov_b32_e32 v2, s2 -; CHECK-NEXT: v_mov_b32_e32 v3, s3 +; CHECK-NEXT: global_atomic_max_f64 v[0:1], v2, v[0:1], s[4:5] glc +; CHECK-NEXT: v_mov_b32_e32 v2, s6 +; CHECK-NEXT: v_mov_b32_e32 v3, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cvt_u32_f64_e32 v0, v[0:1] ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll index 6db65752db54f..5fae60a7acac1 100644 --- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll @@ -277,12 +277,12 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa ; ; GFX9-LABEL: uniform_vec_i16_LL: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s4, s5 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s0 ; GFX9-NEXT: ;;#ASMEND @@ -290,12 +290,12 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa ; ; GFX906-LABEL: uniform_vec_i16_LL: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX906-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX906-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX906-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: s_pack_ll_b32_b16 s0, s4, s5 +; GFX906-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; use s0 ; GFX906-NEXT: ;;#ASMEND @@ -561,12 +561,12 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa ; ; GFX9-LABEL: uniform_vec_f16_LL: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s4, s5 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s0 ; GFX9-NEXT: ;;#ASMEND @@ -574,12 +574,12 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa ; ; GFX906-LABEL: uniform_vec_f16_LL: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX906-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX906-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX906-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: s_pack_ll_b32_b16 s0, s4, s5 +; GFX906-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; use s0 ; GFX906-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll index 9f191fa69f654..9649cdc6001cd 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll @@ -105,12 +105,12 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(ptr addrspace(1) ; ; GFX9-LABEL: simple_write2_two_val_f32_volatile_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ds_write_b32 v0, v1 ; GFX9-NEXT: ds_write_b32 v0, v2 offset:32 @@ -151,12 +151,12 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(ptr addrspace(1) ; ; GFX9-LABEL: simple_write2_two_val_f32_volatile_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ds_write_b32 v0, v1 ; GFX9-NEXT: ds_write_b32 v0, v2 offset:32 @@ -368,11 +368,11 @@ define amdgpu_kernel void @simple_write2_two_val_too_far_f32(ptr addrspace(1) %C ; ; GFX9-LABEL: simple_write2_two_val_too_far_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[0:1] -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: ds_write_b32 v0, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -413,11 +413,11 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2(ptr addrspace(1) %C, ptr ; ; GFX9-LABEL: simple_write2_two_val_f32_x2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[0:1] -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:8 ; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:11 offset1:27 @@ -469,11 +469,11 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(ptr addrspa ; ; GFX9-LABEL: simple_write2_two_val_f32_x2_nonzero_base: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[0:1] -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:3 offset1:8 ; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:11 offset1:27 @@ -963,11 +963,11 @@ define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(ptr addrspace(3) ; ; GFX9-ALIGNED-LABEL: simple_write2_v4f32_superreg_align4: ; GFX9-ALIGNED: ; %bb.0: -; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 -; GFX9-ALIGNED-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 +; GFX9-ALIGNED-NEXT: s_load_dword s6, s[2:3], 0x0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-ALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s4 -; GFX9-ALIGNED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX9-ALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s6 +; GFX9-ALIGNED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, s1 @@ -979,11 +979,11 @@ define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(ptr addrspace(3) ; ; GFX9-UNALIGNED-LABEL: simple_write2_v4f32_superreg_align4: ; GFX9-UNALIGNED: ; %bb.0: -; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 -; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 +; GFX9-UNALIGNED-NEXT: s_load_dword s6, s[2:3], 0x0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-UNALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s4 -; GFX9-UNALIGNED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX9-UNALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s6 +; GFX9-UNALIGNED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, s3 diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll index f53d3cf33c9cc..7c89efd0a713c 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll @@ -197,24 +197,24 @@ define amdgpu_kernel void @v_rcp_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) # ; ; GFX9-LABEL: v_rcp_f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rcp_f16_e32 v1, v1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_rcp_f16: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_rcp_f16_e32 v1, v1 -; GFX10-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_rcp_f16: @@ -293,24 +293,24 @@ define amdgpu_kernel void @v_rcp_f16_abs(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX9-LABEL: v_rcp_f16_abs: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rcp_f16_e64 v1, |v1| -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_rcp_f16_abs: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_rcp_f16_e64 v1, |v1| -; GFX10-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_rcp_f16_abs: @@ -392,24 +392,24 @@ define amdgpu_kernel void @reciprocal_f16_rounded(ptr addrspace(1) %r, ptr addrs ; ; GFX9-LABEL: reciprocal_f16_rounded: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rcp_f16_e32 v1, v1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: reciprocal_f16_rounded: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_rcp_f16_e32 v1, v1 -; GFX10-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: reciprocal_f16_rounded: @@ -475,24 +475,24 @@ define amdgpu_kernel void @v_rcp_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX9-LABEL: v_rcp_f16_afn: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rcp_f16_e32 v1, v1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_rcp_f16_afn: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_rcp_f16_e32 v1, v1 -; GFX10-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_rcp_f16_afn: @@ -571,24 +571,24 @@ define amdgpu_kernel void @v_rcp_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX9-LABEL: v_rcp_f16_neg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rcp_f16_e64 v1, -v1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_rcp_f16_neg: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_rcp_f16_e64 v1, -v1 -; GFX10-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_rcp_f16_neg: @@ -670,24 +670,24 @@ define amdgpu_kernel void @v_rsq_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) # ; ; GFX9-LABEL: v_rsq_f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rsq_f16_e32 v1, v1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_rsq_f16: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_rsq_f16_e32 v1, v1 -; GFX10-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_rsq_f16: @@ -771,26 +771,26 @@ define amdgpu_kernel void @v_rsq_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX9-LABEL: v_rsq_f16_neg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rsq_f16_e32 v1, v1 ; GFX9-NEXT: v_xor_b32_e32 v1, 0x8000, v1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_rsq_f16_neg: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_rsq_f16_e32 v1, v1 ; GFX10-NEXT: v_xor_b32_e32 v1, 0x8000, v1 -; GFX10-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_rsq_f16_neg: @@ -879,28 +879,28 @@ define amdgpu_kernel void @v_rsq_f16_multi_use(ptr addrspace(1) %r, ptr addrspac ; ; GFX9-LABEL: v_rsq_f16_multi_use: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rsq_f16_e32 v2, v1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_short v0, v2, s[0:1] +; GFX9-NEXT: global_store_short v0, v2, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_rsq_f16_multi_use: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_rsq_f16_e32 v2, v1 -; GFX10-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_store_short v0, v2, s[0:1] +; GFX10-NEXT: global_store_short v0, v2, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_rsq_f16_multi_use: @@ -987,26 +987,26 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract0(ptr addrspace(1) %r, ptr ; ; GFX9-LABEL: v_rsq_f16_missing_contract0: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sqrt_f16_e32 v1, v1 ; GFX9-NEXT: v_rcp_f16_e32 v1, v1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_rsq_f16_missing_contract0: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_sqrt_f16_e32 v1, v1 ; GFX10-NEXT: v_rcp_f16_e32 v1, v1 -; GFX10-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_rsq_f16_missing_contract0: @@ -1092,26 +1092,26 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract1(ptr addrspace(1) %r, ptr ; ; GFX9-LABEL: v_rsq_f16_missing_contract1: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sqrt_f16_e32 v1, v1 ; GFX9-NEXT: v_rcp_f16_e32 v1, v1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_rsq_f16_missing_contract1: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_sqrt_f16_e32 v1, v1 ; GFX10-NEXT: v_rcp_f16_e32 v1, v1 -; GFX10-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_rsq_f16_missing_contract1: @@ -1197,26 +1197,26 @@ define amdgpu_kernel void @v_neg_rsq_f16_missing_contract1(ptr addrspace(1) %r, ; ; GFX9-LABEL: v_neg_rsq_f16_missing_contract1: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sqrt_f16_e32 v1, v1 ; GFX9-NEXT: v_rcp_f16_e64 v1, -v1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_neg_rsq_f16_missing_contract1: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_sqrt_f16_e32 v1, v1 ; GFX10-NEXT: v_rcp_f16_e64 v1, -v1 -; GFX10-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_neg_rsq_f16_missing_contract1: diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll index 8510e26a3eafb..e0377ddad14c0 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll @@ -5279,15 +5279,15 @@ define amdgpu_kernel void @atomic_load_i32_offset(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -5328,15 +5328,15 @@ define amdgpu_kernel void @atomic_load_i32(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v2, v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -5689,15 +5689,15 @@ define amdgpu_kernel void @atomic_load_f32_offset(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_f32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -5738,15 +5738,15 @@ define amdgpu_kernel void @atomic_load_f32(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_f32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v2, v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -6099,15 +6099,15 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_i8_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_ubyte v2, v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_byte v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -6148,15 +6148,15 @@ define amdgpu_kernel void @atomic_load_i8(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_i8: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_ubyte v2, v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_byte v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -6394,15 +6394,15 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_i16_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_ushort v2, v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -6443,15 +6443,15 @@ define amdgpu_kernel void @atomic_load_i16(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_i16: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_ushort v2, v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -7979,15 +7979,15 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_f16_offset: ; GCN3: ; %bb.0: -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_ushort v2, v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm %gep = getelementptr half, ptr %in, i64 8 @@ -8027,15 +8027,15 @@ define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_f16: ; GCN3: ; %bb.0: -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_ushort v2, v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm %val = load atomic half, ptr %in seq_cst, align 2 @@ -8078,15 +8078,15 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_bf16_offset: ; GCN3: ; %bb.0: -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_ushort v2, v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm %gep = getelementptr bfloat, ptr %in, i64 8 @@ -8126,15 +8126,15 @@ define amdgpu_kernel void @atomic_load_bf16(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_bf16: ; GCN3: ; %bb.0: -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_ushort v2, v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm %val = load atomic bfloat, ptr %in seq_cst, align 2 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll index 7e4a36b7dc11b..3fd624b592cd4 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll @@ -4439,23 +4439,23 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_max_i64_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GCN3-NEXT: s_add_u32 s0, s0, s6 -; GCN3-NEXT: s_addc_u32 s1, s1, s7 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN3-NEXT: s_add_u32 s0, s4, s0 +; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[0:1], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s5 -; GCN3-NEXT: v_mov_b32_e32 v5, s4 +; GCN3-NEXT: v_mov_b32_e32 v4, s9 +; GCN3-NEXT: v_mov_b32_e32 v5, s8 ; GCN3-NEXT: .LBB89_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v3 ; GCN3-NEXT: v_mov_b32_e32 v8, v2 -; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] +; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[8:9] ; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc @@ -4467,8 +4467,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: s_cbranch_execnz .LBB89_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GCN3-NEXT: s_endpgm entry: @@ -4654,23 +4654,23 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GCN3-LABEL: atomic_max_i64_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GCN3-NEXT: s_add_u32 s0, s0, s6 -; GCN3-NEXT: s_addc_u32 s1, s1, s7 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN3-NEXT: s_add_u32 s0, s4, s0 +; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_mov_b64 s[0:1], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s5 -; GCN3-NEXT: v_mov_b32_e32 v5, s4 +; GCN3-NEXT: v_mov_b32_e32 v4, s9 +; GCN3-NEXT: v_mov_b32_e32 v5, s8 ; GCN3-NEXT: .LBB91_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v3 ; GCN3-NEXT: v_mov_b32_e32 v8, v2 -; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] +; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[8:9] ; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc @@ -4682,8 +4682,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN3-NEXT: s_cbranch_execnz .LBB91_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GCN3-NEXT: s_endpgm entry: @@ -5821,23 +5821,23 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN3-LABEL: atomic_umax_i64_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GCN3-NEXT: s_add_u32 s0, s0, s6 -; GCN3-NEXT: s_addc_u32 s1, s1, s7 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN3-NEXT: s_add_u32 s0, s4, s0 +; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[0:1], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s5 -; GCN3-NEXT: v_mov_b32_e32 v5, s4 +; GCN3-NEXT: v_mov_b32_e32 v4, s9 +; GCN3-NEXT: v_mov_b32_e32 v5, s8 ; GCN3-NEXT: .LBB103_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v3 ; GCN3-NEXT: v_mov_b32_e32 v8, v2 -; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[8:9] ; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc @@ -5849,8 +5849,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN3-NEXT: s_cbranch_execnz .LBB103_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GCN3-NEXT: s_endpgm entry: @@ -5934,23 +5934,23 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; ; GCN3-LABEL: atomic_umax_i64_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GCN3-NEXT: s_add_u32 s0, s0, s6 -; GCN3-NEXT: s_addc_u32 s1, s1, s7 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN3-NEXT: s_add_u32 s0, s4, s0 +; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_mov_b64 s[0:1], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s5 -; GCN3-NEXT: v_mov_b32_e32 v5, s4 +; GCN3-NEXT: v_mov_b32_e32 v4, s9 +; GCN3-NEXT: v_mov_b32_e32 v5, s8 ; GCN3-NEXT: .LBB104_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v3 ; GCN3-NEXT: v_mov_b32_e32 v8, v2 -; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[8:9] ; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc @@ -5962,8 +5962,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN3-NEXT: s_cbranch_execnz .LBB104_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GCN3-NEXT: s_endpgm entry: @@ -8045,23 +8045,23 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_min_i64_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GCN3-NEXT: s_add_u32 s0, s0, s6 -; GCN3-NEXT: s_addc_u32 s1, s1, s7 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN3-NEXT: s_add_u32 s0, s4, s0 +; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[0:1], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s5 -; GCN3-NEXT: v_mov_b32_e32 v5, s4 +; GCN3-NEXT: v_mov_b32_e32 v4, s9 +; GCN3-NEXT: v_mov_b32_e32 v5, s8 ; GCN3-NEXT: .LBB126_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v3 ; GCN3-NEXT: v_mov_b32_e32 v8, v2 -; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] +; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[8:9] ; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc @@ -8073,8 +8073,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: s_cbranch_execnz .LBB126_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GCN3-NEXT: s_endpgm entry: @@ -8148,20 +8148,20 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; ; GCN3-LABEL: atomic_min_i64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GCN3-NEXT: v_mov_b32_e32 v5, s1 -; GCN3-NEXT: v_mov_b32_e32 v6, s3 -; GCN3-NEXT: v_mov_b32_e32 v7, s2 -; GCN3-NEXT: v_mov_b32_e32 v4, s0 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v6, s7 +; GCN3-NEXT: v_mov_b32_e32 v7, s6 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 ; GCN3-NEXT: .LBB127_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] +; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] ; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -8169,9 +8169,9 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB127_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm @@ -8253,23 +8253,23 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GCN3-LABEL: atomic_min_i64_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GCN3-NEXT: s_add_u32 s0, s0, s6 -; GCN3-NEXT: s_addc_u32 s1, s1, s7 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN3-NEXT: s_add_u32 s0, s4, s0 +; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_mov_b64 s[0:1], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s5 -; GCN3-NEXT: v_mov_b32_e32 v5, s4 +; GCN3-NEXT: v_mov_b32_e32 v4, s9 +; GCN3-NEXT: v_mov_b32_e32 v5, s8 ; GCN3-NEXT: .LBB128_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v3 ; GCN3-NEXT: v_mov_b32_e32 v8, v2 -; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] +; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[8:9] ; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc @@ -8281,8 +8281,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN3-NEXT: s_cbranch_execnz .LBB128_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GCN3-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.ll b/llvm/test/CodeGen/AMDGPU/fmax3.ll index 84852c2632f67..f86f5305e6ba1 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax3.ll @@ -67,32 +67,32 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmax3_olt_0_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s14, s2 +; GFX9-NEXT: s_mov_b32 s15, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s12, s2 -; GFX9-NEXT: s_mov_b32 s13, s3 -; GFX9-NEXT: s_mov_b32 s16, s4 -; GFX9-NEXT: s_mov_b32 s17, s5 -; GFX9-NEXT: s_mov_b32 s18, s10 -; GFX9-NEXT: s_mov_b32 s19, s11 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: s_mov_b32 s6, s10 -; GFX9-NEXT: s_mov_b32 s7, s11 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s16, s8 +; GFX9-NEXT: s_mov_b32 s17, s9 +; GFX9-NEXT: s_mov_b32 s18, s2 +; GFX9-NEXT: s_mov_b32 s19, s3 +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc +; GFX9-NEXT: buffer_load_dword v2, off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s0 -; GFX9-NEXT: s_mov_b32 s9, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: v_max3_f32 v0, v0, v1, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: test_fmax3_olt_0_f32: @@ -199,32 +199,32 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmax3_olt_1_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s14, s2 +; GFX9-NEXT: s_mov_b32 s15, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s12, s2 -; GFX9-NEXT: s_mov_b32 s13, s3 -; GFX9-NEXT: s_mov_b32 s16, s4 -; GFX9-NEXT: s_mov_b32 s17, s5 -; GFX9-NEXT: s_mov_b32 s18, s10 -; GFX9-NEXT: s_mov_b32 s19, s11 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: s_mov_b32 s6, s10 -; GFX9-NEXT: s_mov_b32 s7, s11 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s16, s8 +; GFX9-NEXT: s_mov_b32 s17, s9 +; GFX9-NEXT: s_mov_b32 s18, s2 +; GFX9-NEXT: s_mov_b32 s19, s3 +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc +; GFX9-NEXT: buffer_load_dword v2, off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s0 -; GFX9-NEXT: s_mov_b32 s9, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: v_max3_f32 v0, v2, v0, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: test_fmax3_olt_1_f32: @@ -338,32 +338,32 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmax3_olt_0_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s14, s2 +; GFX9-NEXT: s_mov_b32 s15, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s12, s2 -; GFX9-NEXT: s_mov_b32 s13, s3 -; GFX9-NEXT: s_mov_b32 s16, s4 -; GFX9-NEXT: s_mov_b32 s17, s5 -; GFX9-NEXT: s_mov_b32 s18, s10 -; GFX9-NEXT: s_mov_b32 s19, s11 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: s_mov_b32 s6, s10 -; GFX9-NEXT: s_mov_b32 s7, s11 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s16, s8 +; GFX9-NEXT: s_mov_b32 s17, s9 +; GFX9-NEXT: s_mov_b32 s18, s2 +; GFX9-NEXT: s_mov_b32 s19, s3 +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc +; GFX9-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s0 -; GFX9-NEXT: s_mov_b32 s9, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: v_max3_f16 v0, v0, v1, v2 -; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0 +; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: test_fmax3_olt_0_f16: @@ -478,32 +478,32 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmax3_olt_1_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s14, s2 +; GFX9-NEXT: s_mov_b32 s15, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s12, s2 -; GFX9-NEXT: s_mov_b32 s13, s3 -; GFX9-NEXT: s_mov_b32 s16, s4 -; GFX9-NEXT: s_mov_b32 s17, s5 -; GFX9-NEXT: s_mov_b32 s18, s10 -; GFX9-NEXT: s_mov_b32 s19, s11 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: s_mov_b32 s6, s10 -; GFX9-NEXT: s_mov_b32 s7, s11 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s16, s8 +; GFX9-NEXT: s_mov_b32 s17, s9 +; GFX9-NEXT: s_mov_b32 s18, s2 +; GFX9-NEXT: s_mov_b32 s19, s3 +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc +; GFX9-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s0 -; GFX9-NEXT: s_mov_b32 s9, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: v_max3_f16 v0, v2, v0, v1 -; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0 +; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: test_fmax3_olt_1_f16: diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll index 84099e472d65f..6634d36122d0a 100644 --- a/llvm/test/CodeGen/AMDGPU/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll @@ -83,14 +83,14 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_nnan_input_fmed3_r_i_i_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_nnan_input_fmed3_r_i_i_f32: @@ -195,14 +195,14 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, pt ; ; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_f32: @@ -308,14 +308,14 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) ; ; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32: @@ -421,14 +421,14 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) ; ; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32: @@ -538,15 +538,15 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrsp ; ; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_max_f32_e32 v1, 4.0, v1 ; GFX9-NEXT: v_min_f32_e32 v1, 2.0, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32: @@ -668,33 +668,33 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1 ; ; GFX9-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-SDAG-NEXT: v_max_f32_e32 v1, 2.0, v1 ; GFX9-SDAG-NEXT: v_min_f32_e32 v2, 4.0, v1 -; GFX9-SDAG-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-SDAG-NEXT: global_store_dword v0, v2, s[4:5] ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-GISEL-NEXT: v_max_f32_e32 v2, 2.0, v1 ; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v0, v2, s[4:5] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: s_endpgm ; @@ -831,15 +831,15 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: v_test_fmed3_r_i_i_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], 2.0 ; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_fmed3_r_i_i_f64: @@ -943,13 +943,13 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_fmed3_r_i_i_no_nans_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_fmed3_r_i_i_no_nans_f32: @@ -1057,29 +1057,29 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ; ; GFX9-SDAG-LABEL: v_test_legacy_fmed3_r_i_i_f32: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-SDAG-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: v_test_legacy_fmed3_r_i_i_f32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-GISEL-NEXT: v_cmp_nlt_f32_e32 vcc, 2.0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 2.0, vcc ; GFX9-GISEL-NEXT: v_cmp_ngt_f32_e32 vcc, 4.0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 4.0, vcc -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_test_legacy_fmed3_r_i_i_f32: @@ -1244,33 +1244,33 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa ; ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[8:9] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[10:11] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_med3_f32 v1, -v1, v2, v3 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[8:9] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[10:11] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1 ; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: @@ -1438,33 +1438,33 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa ; ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[8:9] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[10:11] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_med3_f32 v1, v1, -v2, v3 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[8:9] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[10:11] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_max_f32_e64 v2, -v2, -v2 ; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: @@ -1632,33 +1632,33 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa ; ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[8:9] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[10:11] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_med3_f32 v1, v1, v2, -v3 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[8:9] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[10:11] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_max_f32_e64 v3, -v3, -v3 ; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: @@ -1828,34 +1828,34 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs ; ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[8:9] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[10:11] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_med3_f32 v1, -v1, |v2|, -|v3| -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[8:9] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[10:11] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1 ; GFX9-GISEL-NEXT: v_max_f32_e64 v3, -|v3|, -|v3| ; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, |v2|, v3 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: @@ -2034,35 +2034,35 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs ; ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[8:9] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[10:11] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_med3_f32 v1, -|v1|, -|v2|, -|v3| -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[8:9] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[10:11] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_max_f32_e64 v1, -|v1|, -|v1| ; GFX9-GISEL-NEXT: v_max_f32_e64 v2, -|v2|, -|v2| ; GFX9-GISEL-NEXT: v_max_f32_e64 v3, -|v3|, -|v3| ; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: @@ -2250,20 +2250,20 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt ; ; GFX9-LABEL: v_nnan_inputs_med3_f32_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v2, 2.0, v2 ; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3 ; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_nnan_inputs_med3_f32_pat0: @@ -2415,17 +2415,17 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_nnan_input_calls_med3_f32_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_nnan_input_calls_med3_f32_pat0: @@ -2569,17 +2569,17 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_nnan_call_med3_f32_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_nnan_call_med3_f32_pat0: @@ -2723,17 +2723,17 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_fast_call_med3_f32_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_fast_call_med3_f32_pat0: @@ -2889,17 +2889,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat0: @@ -3043,17 +3043,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat1: @@ -3199,33 +3199,33 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa ; ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[8:9] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[10:11] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_med3_f32 v1, -v1, v2, v3 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[8:9] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[10:11] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1 ; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: @@ -3391,17 +3391,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat2: @@ -3545,17 +3545,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat3: @@ -3699,17 +3699,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat4: @@ -3853,17 +3853,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat5: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat5: @@ -4007,17 +4007,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat6: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat6: @@ -4161,17 +4161,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat7: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat7: @@ -4315,17 +4315,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat8: @@ -4469,17 +4469,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat9: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat9: @@ -4623,17 +4623,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat10: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat10: @@ -4777,17 +4777,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat11: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat11: @@ -4931,17 +4931,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat12: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat12: @@ -5085,17 +5085,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat13: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat13: @@ -5239,17 +5239,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat14: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat14: @@ -5393,17 +5393,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat15: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat15: @@ -5550,17 +5550,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat16: @@ -5743,14 +5743,14 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) ; ; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 @@ -5761,7 +5761,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX9-NEXT: v_max_f32_e32 v1, v4, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_safe_med3_f32_pat0_multi_use0: @@ -5947,14 +5947,14 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) ; ; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 @@ -5965,7 +5965,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX9-NEXT: v_max_f32_e32 v1, v4, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use1: @@ -6178,14 +6178,14 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) ; ; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 @@ -6196,7 +6196,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) ; GFX9-NEXT: global_store_dword v[0:1], v1, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e32 v1, v4, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_safe_med3_f32_pat0_multi_use2: @@ -6370,14 +6370,14 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_test_safe_med3_f32_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 @@ -6386,7 +6386,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr ; GFX9-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX9-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX9-NEXT: v_max_f32_e32 v1, v4, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_test_safe_med3_f32_pat0: @@ -6569,20 +6569,20 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) ; ; GFX9-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v2, 2.0, v2 ; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3 ; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: @@ -6746,20 +6746,20 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) ; ; GFX9-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v2, 2.0, v2 ; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3 ; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: @@ -6923,20 +6923,20 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) ; ; GFX9-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v2, 2.0, v2 ; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3 ; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: @@ -7090,33 +7090,33 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa ; ; GFX9-SDAG-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[8:9] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[10:11] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_med3_f32 v1, -v1, v2, v3 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[8:9] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[10:11] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1 ; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: @@ -7295,39 +7295,39 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt ; ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[8:9] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[10:11] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_min_f32_e64 v4, -v1, v2 ; GFX9-SDAG-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX9-SDAG-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX9-SDAG-NEXT: v_max_f32_e32 v1, v4, v1 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[8:9] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[10:11] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_max_f32_e64 v4, -v1, -v1 ; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX9-GISEL-NEXT: v_min_f32_e32 v2, v4, v2 ; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v2, v1 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: @@ -7502,18 +7502,18 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out ; ; GFX9-LABEL: v_test_global_nnans_min_max_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_global_nnans_min_max_f32: @@ -7637,14 +7637,14 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_nnan_input_fmed3_r_i_i_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f16_e32 v1, 1.0, v1 ; GFX9-NEXT: v_med3_f16 v1, v1, 2.0, 4.0 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_nnan_input_fmed3_r_i_i_f16: @@ -7825,20 +7825,20 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt ; ; GFX9-LABEL: v_nnan_inputs_med3_f16_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_ushort v2, v0, s[4:5] glc +; GFX9-NEXT: global_load_ushort v2, v0, s[8:9] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_ushort v3, v0, s[6:7] glc +; GFX9-NEXT: global_load_ushort v3, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f16_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f16_e32 v2, 2.0, v2 ; GFX9-NEXT: v_add_f16_e32 v3, 4.0, v3 ; GFX9-NEXT: v_med3_f16 v1, v1, v2, v3 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_nnan_inputs_med3_f16_pat0: @@ -7963,15 +7963,15 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: two_non_inline_constant: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, 0.5, v1 ; GFX9-NEXT: v_max_f32_e32 v1, 0x41000000, v1 ; GFX9-NEXT: v_min_f32_e32 v1, 0x41800000, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: two_non_inline_constant: @@ -8114,16 +8114,16 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: one_non_inline_constant: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x41800000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v3, 0.5, v1 ; GFX9-NEXT: v_add_f32_e32 v1, 0x41800000, v1 ; GFX9-NEXT: v_med3_f32 v2, v3, 1.0, v2 -; GFX9-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-NEXT: global_store_dword v0, v2, s[4:5] ; GFX9-NEXT: global_store_dword v[0:1], v1, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -8271,18 +8271,18 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o ; ; GFX9-SDAG-LABEL: two_non_inline_constant_multi_use: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_mov_b32 s0, 0x41000000 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x41800000 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-SDAG-NEXT: s_mov_b32 s2, 0x41000000 +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_add_f32_e32 v3, 0.5, v1 ; GFX9-SDAG-NEXT: v_add_f32_e32 v4, 0x41800000, v1 ; GFX9-SDAG-NEXT: v_add_f32_e32 v1, 0x41000000, v1 -; GFX9-SDAG-NEXT: v_med3_f32 v2, v3, s2, v2 -; GFX9-SDAG-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-SDAG-NEXT: v_med3_f32 v2, v3, s0, v2 +; GFX9-SDAG-NEXT: global_store_dword v0, v2, s[4:5] ; GFX9-SDAG-NEXT: global_store_dword v[0:1], v4, off ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: global_store_dword v[0:1], v1, off @@ -8291,18 +8291,18 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o ; ; GFX9-GISEL-LABEL: two_non_inline_constant_multi_use: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x41000000 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x41800000 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_add_f32_e32 v4, 0.5, v1 ; GFX9-GISEL-NEXT: v_add_f32_e32 v5, 0x41800000, v1 ; GFX9-GISEL-NEXT: v_add_f32_e32 v1, 0x41000000, v1 ; GFX9-GISEL-NEXT: v_med3_f32 v2, v4, v2, v3 -; GFX9-GISEL-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v0, v2, s[4:5] ; GFX9-GISEL-NEXT: global_store_dword v[0:1], v5, off ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: global_store_dword v[0:1], v1, off diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll index 3a55b2d50a5e5..2025ddb07e83a 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll @@ -67,32 +67,32 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmin3_olt_0_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s14, s2 +; GFX9-NEXT: s_mov_b32 s15, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s12, s2 -; GFX9-NEXT: s_mov_b32 s13, s3 -; GFX9-NEXT: s_mov_b32 s16, s4 -; GFX9-NEXT: s_mov_b32 s17, s5 -; GFX9-NEXT: s_mov_b32 s18, s10 -; GFX9-NEXT: s_mov_b32 s19, s11 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: s_mov_b32 s6, s10 -; GFX9-NEXT: s_mov_b32 s7, s11 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s16, s8 +; GFX9-NEXT: s_mov_b32 s17, s9 +; GFX9-NEXT: s_mov_b32 s18, s2 +; GFX9-NEXT: s_mov_b32 s19, s3 +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc +; GFX9-NEXT: buffer_load_dword v2, off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s0 -; GFX9-NEXT: s_mov_b32 s9, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: v_min3_f32 v0, v0, v1, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: test_fmin3_olt_0_f32: @@ -199,32 +199,32 @@ define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmin3_olt_1_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s14, s2 +; GFX9-NEXT: s_mov_b32 s15, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s12, s2 -; GFX9-NEXT: s_mov_b32 s13, s3 -; GFX9-NEXT: s_mov_b32 s16, s4 -; GFX9-NEXT: s_mov_b32 s17, s5 -; GFX9-NEXT: s_mov_b32 s18, s10 -; GFX9-NEXT: s_mov_b32 s19, s11 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: s_mov_b32 s6, s10 -; GFX9-NEXT: s_mov_b32 s7, s11 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s16, s8 +; GFX9-NEXT: s_mov_b32 s17, s9 +; GFX9-NEXT: s_mov_b32 s18, s2 +; GFX9-NEXT: s_mov_b32 s19, s3 +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc +; GFX9-NEXT: buffer_load_dword v2, off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s0 -; GFX9-NEXT: s_mov_b32 s9, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: v_min3_f32 v0, v2, v0, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: test_fmin3_olt_1_f32: @@ -338,32 +338,32 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmin3_olt_0_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s14, s2 +; GFX9-NEXT: s_mov_b32 s15, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s12, s2 -; GFX9-NEXT: s_mov_b32 s13, s3 -; GFX9-NEXT: s_mov_b32 s16, s4 -; GFX9-NEXT: s_mov_b32 s17, s5 -; GFX9-NEXT: s_mov_b32 s18, s10 -; GFX9-NEXT: s_mov_b32 s19, s11 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: s_mov_b32 s6, s10 -; GFX9-NEXT: s_mov_b32 s7, s11 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s16, s8 +; GFX9-NEXT: s_mov_b32 s17, s9 +; GFX9-NEXT: s_mov_b32 s18, s2 +; GFX9-NEXT: s_mov_b32 s19, s3 +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc +; GFX9-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s0 -; GFX9-NEXT: s_mov_b32 s9, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: v_min3_f16 v0, v0, v1, v2 -; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0 +; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: test_fmin3_olt_0_f16: @@ -478,32 +478,32 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmin3_olt_1_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s14, s2 +; GFX9-NEXT: s_mov_b32 s15, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s12, s2 -; GFX9-NEXT: s_mov_b32 s13, s3 -; GFX9-NEXT: s_mov_b32 s16, s4 -; GFX9-NEXT: s_mov_b32 s17, s5 -; GFX9-NEXT: s_mov_b32 s18, s10 -; GFX9-NEXT: s_mov_b32 s19, s11 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: s_mov_b32 s6, s10 -; GFX9-NEXT: s_mov_b32 s7, s11 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s16, s8 +; GFX9-NEXT: s_mov_b32 s17, s9 +; GFX9-NEXT: s_mov_b32 s18, s2 +; GFX9-NEXT: s_mov_b32 s19, s3 +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc +; GFX9-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s0 -; GFX9-NEXT: s_mov_b32 s9, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: v_min3_f16 v0, v2, v0, v1 -; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0 +; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: test_fmin3_olt_1_f16: @@ -680,36 +680,36 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmin3_olt_0_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s14, s2 +; GFX9-NEXT: s_mov_b32 s15, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s12, s2 -; GFX9-NEXT: s_mov_b32 s13, s3 -; GFX9-NEXT: s_mov_b32 s16, s4 -; GFX9-NEXT: s_mov_b32 s17, s5 -; GFX9-NEXT: s_mov_b32 s18, s10 -; GFX9-NEXT: s_mov_b32 s19, s11 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s16, s8 +; GFX9-NEXT: s_mov_b32 s17, s9 +; GFX9-NEXT: s_mov_b32 s18, s2 +; GFX9-NEXT: s_mov_b32 s19, s3 ; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: s_mov_b32 s6, s10 -; GFX9-NEXT: s_mov_b32 s7, s11 -; GFX9-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 glc +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: buffer_load_dwordx2 v[4:5], off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s0 -; GFX9-NEXT: s_mov_b32 s9, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: test_fmin3_olt_0_f64: @@ -827,36 +827,36 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmin3_olt_1_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s14, s2 +; GFX9-NEXT: s_mov_b32 s15, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s12, s2 -; GFX9-NEXT: s_mov_b32 s13, s3 -; GFX9-NEXT: s_mov_b32 s16, s4 -; GFX9-NEXT: s_mov_b32 s17, s5 -; GFX9-NEXT: s_mov_b32 s18, s10 -; GFX9-NEXT: s_mov_b32 s19, s11 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s16, s8 +; GFX9-NEXT: s_mov_b32 s17, s9 +; GFX9-NEXT: s_mov_b32 s18, s2 +; GFX9-NEXT: s_mov_b32 s19, s3 ; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: s_mov_b32 s6, s10 -; GFX9-NEXT: s_mov_b32 s7, s11 -; GFX9-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 glc +; GFX9-NEXT: s_mov_b32 s8, s10 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: buffer_load_dwordx2 v[4:5], off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s0 -; GFX9-NEXT: s_mov_b32 s9, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX9-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: test_fmin3_olt_1_f64: diff --git a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll index 7c1c970b3fef7..7b8384f317c6c 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll @@ -111,23 +111,41 @@ define amdgpu_kernel void @fmul_f16_imm_a( ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; GFX89-LABEL: fmul_f16_imm_a: -; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: s_mov_b32 s10, s6 -; GFX89-NEXT: s_mov_b32 s11, s7 -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s8, s2 -; GFX89-NEXT: s_mov_b32 s9, s3 -; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: s_mov_b32 s4, s0 -; GFX89-NEXT: s_mov_b32 s5, s1 -; GFX89-NEXT: v_mul_f16_e32 v0, 0x4200, v0 -; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 -; GFX89-NEXT: s_endpgm +; VI-LABEL: fmul_f16_imm_a: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: v_mul_f16_e32 v0, 0x4200, v0 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fmul_f16_imm_a: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: v_mul_f16_e32 v0, 0x4200, v0 +; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: fmul_f16_imm_a: ; GFX11: ; %bb.0: ; %entry @@ -178,23 +196,41 @@ define amdgpu_kernel void @fmul_f16_imm_b( ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; GFX89-LABEL: fmul_f16_imm_b: -; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: s_mov_b32 s10, s6 -; GFX89-NEXT: s_mov_b32 s11, s7 -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s8, s2 -; GFX89-NEXT: s_mov_b32 s9, s3 -; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: s_mov_b32 s4, s0 -; GFX89-NEXT: s_mov_b32 s5, s1 -; GFX89-NEXT: v_mul_f16_e32 v0, 4.0, v0 -; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 -; GFX89-NEXT: s_endpgm +; VI-LABEL: fmul_f16_imm_b: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: v_mul_f16_e32 v0, 4.0, v0 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fmul_f16_imm_b: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: v_mul_f16_e32 v0, 4.0, v0 +; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: fmul_f16_imm_b: ; GFX11: ; %bb.0: ; %entry @@ -390,21 +426,21 @@ define amdgpu_kernel void @fmul_v2f16_imm_a( ; ; GFX9-LABEL: fmul_v2f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_mov_b32 s10, s6 -; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s2 -; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 ; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s0, 0x44004200 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s4, 0x44004200 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_mul_f16 v0, v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: v_pk_mul_f16 v0, v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: fmul_v2f16_imm_a: @@ -485,21 +521,21 @@ define amdgpu_kernel void @fmul_v2f16_imm_b( ; ; GFX9-LABEL: fmul_v2f16_imm_b: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_mov_b32 s10, s6 -; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s2 -; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 ; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s0, 0x42004400 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s4, 0x42004400 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_mul_f16 v0, v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: v_pk_mul_f16 v0, v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: fmul_v2f16_imm_b: @@ -725,23 +761,23 @@ define amdgpu_kernel void @fmul_v4f16_imm_a( ; ; GFX9-LABEL: fmul_v4f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_mov_b32 s10, s6 -; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s2 -; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 ; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s2, 0x44004200 -; GFX9-NEXT: s_mov_b32 s3, 0x40004800 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s6, 0x44004200 +; GFX9-NEXT: s_mov_b32 s7, 0x40004800 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_mul_f16 v1, v1, s2 -; GFX9-NEXT: v_pk_mul_f16 v0, v0, s3 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: v_pk_mul_f16 v1, v1, s6 +; GFX9-NEXT: v_pk_mul_f16 v0, v0, s7 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: fmul_v4f16_imm_a: diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll index 70cdfeb6d4954..2b556a0be2b16 100644 --- a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll @@ -62,32 +62,32 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-FLUSH-LABEL: fmuladd_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX10-FLUSH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_clause 0x2 -; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] -; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[4:5] -; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7] +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[8:9] +; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[10:11] ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(1) ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v3 -; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-LABEL: fmuladd_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: s_clause 0x2 -; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[2:3] -; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[4:5] -; GFX10-DENORM-NEXT: global_load_ushort v3, v0, s[6:7] +; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[8:9] +; GFX10-DENORM-NEXT: global_load_ushort v3, v0, s[10:11] ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-NEXT: v_fmac_f16_e32 v3, v1, v2 -; GFX10-DENORM-NEXT: global_store_short v0, v3, s[0:1] +; GFX10-DENORM-NEXT: global_store_short v0, v3, s[4:5] ; GFX10-DENORM-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: fmuladd_f16: @@ -176,48 +176,48 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-FLUSH-LABEL: fmul_fadd_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX10-FLUSH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_clause 0x2 -; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] -; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[4:5] -; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7] +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[8:9] +; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[10:11] ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(1) ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v3 -; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-STRICT-LABEL: fmul_fadd_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: s_clause 0x2 -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[4:5] -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[6:7] +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[8:9] +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[10:11] ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(1) ; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v3 -; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DENORM-STRICT-NEXT: s_endpgm ; ; GFX10-DENORM-CONTRACT-LABEL: fmul_fadd_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: s_clause 0x2 -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[4:5] -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[6:7] +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[8:9] +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[10:11] ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v3, v1, v2 -; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v3, s[0:1] +; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v3, s[4:5] ; GFX10-DENORM-CONTRACT-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: fmul_fadd_f16: @@ -326,32 +326,32 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add ; ; GFX10-FLUSH-LABEL: fmul_fadd_contract_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX10-FLUSH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_clause 0x2 -; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] -; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[4:5] -; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7] +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[8:9] +; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[10:11] ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(1) ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v3 -; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-LABEL: fmul_fadd_contract_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: s_clause 0x2 -; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[2:3] -; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[4:5] -; GFX10-DENORM-NEXT: global_load_ushort v3, v0, s[6:7] +; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[8:9] +; GFX10-DENORM-NEXT: global_load_ushort v3, v0, s[10:11] ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-NEXT: v_fmac_f16_e32 v3, v1, v2 -; GFX10-DENORM-NEXT: global_store_short v0, v3, s[0:1] +; GFX10-DENORM-NEXT: global_store_short v0, v3, s[4:5] ; GFX10-DENORM-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: fmul_fadd_contract_f16: diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll index ce1fcccf4a17c..d964cf4858f09 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -1022,22 +1022,22 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret(ptr addrspace(1) %ptr, double %data) { ; GFX90A-LABEL: global_atomic_fadd_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, s2 -; GFX90A-NEXT: v_mov_b32_e32 v1, s3 -; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NEXT: v_mov_b32_e32 v1, s7 +; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] ; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) @@ -1047,22 +1047,22 @@ main_body: define amdgpu_kernel void @global_atomic_fmin_f64_noret(ptr addrspace(1) %ptr, double %data) { ; GFX90A-LABEL: global_atomic_fmin_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, s2 -; GFX90A-NEXT: v_mov_b32_e32 v1, s3 -; GFX90A-NEXT: global_atomic_min_f64 v2, v[0:1], s[0:1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NEXT: v_mov_b32_e32 v1, s7 +; GFX90A-NEXT: global_atomic_min_f64 v2, v[0:1], s[4:5] ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fmin_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-NEXT: global_atomic_min_f64 v2, v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-NEXT: global_atomic_min_f64 v2, v[0:1], s[4:5] ; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data) @@ -1072,22 +1072,22 @@ main_body: define amdgpu_kernel void @global_atomic_fmax_f64_noret(ptr addrspace(1) %ptr, double %data) { ; GFX90A-LABEL: global_atomic_fmax_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, s2 -; GFX90A-NEXT: v_mov_b32_e32 v1, s3 -; GFX90A-NEXT: global_atomic_max_f64 v2, v[0:1], s[0:1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NEXT: v_mov_b32_e32 v1, s7 +; GFX90A-NEXT: global_atomic_max_f64 v2, v[0:1], s[4:5] ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fmax_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-NEXT: global_atomic_max_f64 v2, v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-NEXT: global_atomic_max_f64 v2, v[0:1], s[4:5] ; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data) @@ -1139,14 +1139,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB39_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] sc1 +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: .LBB39_2: @@ -1166,13 +1166,13 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB40_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] +; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: .LBB40_2: @@ -1187,14 +1187,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB40_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: .LBB40_2: @@ -1249,14 +1249,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB41_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] sc1 +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: .LBB41_2: @@ -1276,13 +1276,13 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB42_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] +; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: .LBB42_2: @@ -1297,14 +1297,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB42_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: .LBB42_2: @@ -1519,14 +1519,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB49_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: .LBB49_2: @@ -1539,12 +1539,12 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX90A-NEXT: s_mov_b64 s[0:1], 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1610,12 +1610,12 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_system: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX90A-NEXT: s_mov_b64 s[0:1], 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1760,23 +1760,23 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret(ptr %ptr, double %data) { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NEXT: v_mov_b32_e32 v3, s7 ; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: flat_atomic_fadd_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s0 -; GFX940-NEXT: v_mov_b32_e32 v1, s1 -; GFX940-NEXT: v_mov_b32_e32 v2, s2 -; GFX940-NEXT: v_mov_b32_e32 v3, s3 +; GFX940-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: v_mov_b32_e32 v3, s7 ; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; GFX940-NEXT: s_endpgm main_body: @@ -1806,12 +1806,12 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX90A-NEXT: s_mov_b64 s[0:1], 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1846,23 +1846,23 @@ main_body: define amdgpu_kernel void @flat_atomic_fmin_f64_noret(ptr %ptr, double %data) { ; GFX90A-LABEL: flat_atomic_fmin_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NEXT: v_mov_b32_e32 v3, s7 ; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: flat_atomic_fmin_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s0 -; GFX940-NEXT: v_mov_b32_e32 v1, s1 -; GFX940-NEXT: v_mov_b32_e32 v2, s2 -; GFX940-NEXT: v_mov_b32_e32 v3, s3 +; GFX940-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: v_mov_b32_e32 v3, s7 ; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] ; GFX940-NEXT: s_endpgm main_body: @@ -1892,23 +1892,23 @@ main_body: define amdgpu_kernel void @flat_atomic_fmax_f64_noret(ptr %ptr, double %data) { ; GFX90A-LABEL: flat_atomic_fmax_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NEXT: v_mov_b32_e32 v3, s7 ; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: flat_atomic_fmax_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s0 -; GFX940-NEXT: v_mov_b32_e32 v1, s1 -; GFX940-NEXT: v_mov_b32_e32 v2, s2 -; GFX940-NEXT: v_mov_b32_e32 v3, s3 +; GFX940-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: v_mov_b32_e32 v3, s7 ; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] ; GFX940-NEXT: s_endpgm main_body: diff --git a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll index 8c6dc4395839c..75f4dff14fcbd 100644 --- a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll @@ -23,23 +23,41 @@ define amdgpu_kernel void @fpext_f16_to_f32( ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; GFX89-LABEL: fpext_f16_to_f32: -; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: s_mov_b32 s10, s6 -; GFX89-NEXT: s_mov_b32 s11, s7 -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s8, s2 -; GFX89-NEXT: s_mov_b32 s9, s3 -; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GFX89-NEXT: s_mov_b32 s4, s0 -; GFX89-NEXT: s_mov_b32 s5, s1 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GFX89-NEXT: s_endpgm +; VI-LABEL: fpext_f16_to_f32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fpext_f16_to_f32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: fpext_f16_to_f32: ; GFX11: ; %bb.0: ; %entry @@ -89,24 +107,43 @@ define amdgpu_kernel void @fpext_f16_to_f64( ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; GFX89-LABEL: fpext_f16_to_f64: -; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: s_mov_b32 s10, s6 -; GFX89-NEXT: s_mov_b32 s11, s7 -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s8, s2 -; GFX89-NEXT: s_mov_b32 s9, s3 -; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GFX89-NEXT: s_mov_b32 s4, s0 -; GFX89-NEXT: s_mov_b32 s5, s1 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX89-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GFX89-NEXT: s_endpgm +; VI-LABEL: fpext_f16_to_f64: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fpext_f16_to_f64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: fpext_f16_to_f64: ; GFX11: ; %bb.0: ; %entry @@ -159,24 +196,43 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f32( ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; GFX89-LABEL: fpext_v2f16_to_v2f32: -; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: s_mov_b32 s10, s6 -; GFX89-NEXT: s_mov_b32 s11, s7 -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s8, s2 -; GFX89-NEXT: s_mov_b32 s9, s3 -; GFX89-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; GFX89-NEXT: s_mov_b32 s4, s0 -; GFX89-NEXT: s_mov_b32 s5, s1 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GFX89-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GFX89-NEXT: s_endpgm +; VI-LABEL: fpext_v2f16_to_v2f32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fpext_v2f16_to_v2f32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: fpext_v2f16_to_v2f32: ; GFX11: ; %bb.0: ; %entry @@ -232,26 +288,47 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f64( ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; GFX89-LABEL: fpext_v2f16_to_v2f64: -; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: s_mov_b32 s10, s6 -; GFX89-NEXT: s_mov_b32 s11, s7 -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s8, s2 -; GFX89-NEXT: s_mov_b32 s9, s3 -; GFX89-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX89-NEXT: s_mov_b32 s4, s0 -; GFX89-NEXT: s_mov_b32 s5, s1 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX89-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX89-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 -; GFX89-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; GFX89-NEXT: s_endpgm +; VI-LABEL: fpext_v2f16_to_v2f64: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; VI-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 +; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fpext_v2f16_to_v2f64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX9-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: fpext_v2f16_to_v2f64: ; GFX11: ; %bb.0: ; %entry @@ -350,23 +427,41 @@ define amdgpu_kernel void @fneg_fpext_f16_to_f32( ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; GFX89-LABEL: fneg_fpext_f16_to_f32: -; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: s_mov_b32 s10, s6 -; GFX89-NEXT: s_mov_b32 s11, s7 -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s8, s2 -; GFX89-NEXT: s_mov_b32 s9, s3 -; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GFX89-NEXT: s_mov_b32 s4, s0 -; GFX89-NEXT: s_mov_b32 s5, s1 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GFX89-NEXT: s_endpgm +; VI-LABEL: fneg_fpext_f16_to_f32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fneg_fpext_f16_to_f32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: fneg_fpext_f16_to_f32: ; GFX11: ; %bb.0: ; %entry @@ -416,23 +511,41 @@ define amdgpu_kernel void @fabs_fpext_f16_to_f32( ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; GFX89-LABEL: fabs_fpext_f16_to_f32: -; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: s_mov_b32 s10, s6 -; GFX89-NEXT: s_mov_b32 s11, s7 -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s8, s2 -; GFX89-NEXT: s_mov_b32 s9, s3 -; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GFX89-NEXT: s_mov_b32 s4, s0 -; GFX89-NEXT: s_mov_b32 s5, s1 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GFX89-NEXT: s_endpgm +; VI-LABEL: fabs_fpext_f16_to_f32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fabs_fpext_f16_to_f32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: fabs_fpext_f16_to_f32: ; GFX11: ; %bb.0: ; %entry @@ -482,23 +595,41 @@ define amdgpu_kernel void @fneg_fabs_fpext_f16_to_f32( ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; GFX89-LABEL: fneg_fabs_fpext_f16_to_f32: -; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: s_mov_b32 s10, s6 -; GFX89-NEXT: s_mov_b32 s11, s7 -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s8, s2 -; GFX89-NEXT: s_mov_b32 s9, s3 -; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GFX89-NEXT: s_mov_b32 s4, s0 -; GFX89-NEXT: s_mov_b32 s5, s1 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: v_cvt_f32_f16_e64 v0, -|v0| -; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GFX89-NEXT: s_endpgm +; VI-LABEL: fneg_fabs_fpext_f16_to_f32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e64 v0, -|v0| +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fneg_fabs_fpext_f16_to_f32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_f16_e64 v0, -|v0| +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: fneg_fabs_fpext_f16_to_f32: ; GFX11: ; %bb.0: ; %entry @@ -555,27 +686,49 @@ define amdgpu_kernel void @fneg_multi_use_fpext_f16_to_f32( ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; -; GFX89-LABEL: fneg_multi_use_fpext_f16_to_f32: -; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: s_mov_b32 s10, s6 -; GFX89-NEXT: s_mov_b32 s11, s7 -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s8, s2 -; GFX89-NEXT: s_mov_b32 s9, s3 -; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GFX89-NEXT: s_mov_b32 s4, s0 -; GFX89-NEXT: s_mov_b32 s5, s1 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: v_cvt_f32_f16_e64 v1, -v0 -; GFX89-NEXT: v_xor_b32_e32 v0, 0x8000, v0 -; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: s_endpgm +; VI-LABEL: fneg_multi_use_fpext_f16_to_f32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e64 v1, -v0 +; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fneg_multi_use_fpext_f16_to_f32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_f16_e64 v1, -v0 +; GFX9-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: fneg_multi_use_fpext_f16_to_f32: ; GFX11: ; %bb.0: ; %entry @@ -636,27 +789,49 @@ define amdgpu_kernel void @fneg_multi_foldable_use_fpext_f16_to_f32( ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; -; GFX89-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32: -; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: s_mov_b32 s10, s6 -; GFX89-NEXT: s_mov_b32 s11, s7 -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s8, s2 -; GFX89-NEXT: s_mov_b32 s9, s3 -; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GFX89-NEXT: s_mov_b32 s4, s0 -; GFX89-NEXT: s_mov_b32 s5, s1 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: v_cvt_f32_f16_e64 v1, -v0 -; GFX89-NEXT: v_mul_f16_e64 v0, -v0, v0 -; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: s_endpgm +; VI-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e64 v1, -v0 +; VI-NEXT: v_mul_f16_e64 v0, -v0, v0 +; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_f16_e64 v1, -v0 +; GFX9-NEXT: v_mul_f16_e64 v0, -v0, v0 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32: ; GFX11: ; %bb.0: ; %entry @@ -716,27 +891,49 @@ define amdgpu_kernel void @fabs_multi_use_fpext_f16_to_f32( ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; -; GFX89-LABEL: fabs_multi_use_fpext_f16_to_f32: -; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: s_mov_b32 s10, s6 -; GFX89-NEXT: s_mov_b32 s11, s7 -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s8, s2 -; GFX89-NEXT: s_mov_b32 s9, s3 -; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GFX89-NEXT: s_mov_b32 s4, s0 -; GFX89-NEXT: s_mov_b32 s5, s1 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: v_cvt_f32_f16_e64 v1, |v0| -; GFX89-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: s_endpgm +; VI-LABEL: fabs_multi_use_fpext_f16_to_f32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e64 v1, |v0| +; VI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fabs_multi_use_fpext_f16_to_f32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_f16_e64 v1, |v0| +; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: fabs_multi_use_fpext_f16_to_f32: ; GFX11: ; %bb.0: ; %entry @@ -797,27 +994,49 @@ define amdgpu_kernel void @fabs_multi_foldable_use_fpext_f16_to_f32( ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; -; GFX89-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32: -; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: s_mov_b32 s10, s6 -; GFX89-NEXT: s_mov_b32 s11, s7 -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s8, s2 -; GFX89-NEXT: s_mov_b32 s9, s3 -; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GFX89-NEXT: s_mov_b32 s4, s0 -; GFX89-NEXT: s_mov_b32 s5, s1 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: v_cvt_f32_f16_e64 v1, |v0| -; GFX89-NEXT: v_mul_f16_e64 v0, |v0|, v0 -; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: s_endpgm +; VI-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e64 v1, |v0| +; VI-NEXT: v_mul_f16_e64 v0, |v0|, v0 +; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_f16_e64 v1, |v0| +; GFX9-NEXT: v_mul_f16_e64 v0, |v0|, v0 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32: ; GFX11: ; %bb.0: ; %entry @@ -877,27 +1096,49 @@ define amdgpu_kernel void @fabs_fneg_multi_use_fpext_f16_to_f32( ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; -; GFX89-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32: -; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: s_mov_b32 s10, s6 -; GFX89-NEXT: s_mov_b32 s11, s7 -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s8, s2 -; GFX89-NEXT: s_mov_b32 s9, s3 -; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GFX89-NEXT: s_mov_b32 s4, s0 -; GFX89-NEXT: s_mov_b32 s5, s1 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: v_cvt_f32_f16_e64 v1, -|v0| -; GFX89-NEXT: v_or_b32_e32 v0, 0x8000, v0 -; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: s_endpgm +; VI-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e64 v1, -|v0| +; VI-NEXT: v_or_b32_e32 v0, 0x8000, v0 +; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_f16_e64 v1, -|v0| +; GFX9-NEXT: v_or_b32_e32 v0, 0x8000, v0 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32: ; GFX11: ; %bb.0: ; %entry @@ -959,27 +1200,49 @@ define amdgpu_kernel void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32( ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; -; GFX89-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32: -; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: s_mov_b32 s10, s6 -; GFX89-NEXT: s_mov_b32 s11, s7 -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s8, s2 -; GFX89-NEXT: s_mov_b32 s9, s3 -; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GFX89-NEXT: s_mov_b32 s4, s0 -; GFX89-NEXT: s_mov_b32 s5, s1 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: v_cvt_f32_f16_e64 v1, -|v0| -; GFX89-NEXT: v_mul_f16_e64 v0, -|v0|, v0 -; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: s_endpgm +; VI-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e64 v1, -|v0| +; VI-NEXT: v_mul_f16_e64 v0, -|v0|, v0 +; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_f16_e64 v1, -|v0| +; GFX9-NEXT: v_mul_f16_e64 v0, -|v0|, v0 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32: ; GFX11: ; %bb.0: ; %entry @@ -1020,6 +1283,3 @@ entry: declare half @llvm.fabs.f16(half) #1 attributes #1 = { nounwind readnone } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX9: {{.*}} -; VI: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll index 65ac2e240469d..0817ac1b3cd67 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll @@ -71,32 +71,32 @@ define amdgpu_kernel void @fptrunc_f32_to_f16( ; ; GFX9-SDAG-LABEL: fptrunc_f32_to_f16: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 -; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 -; GFX9-SDAG-NEXT: s_mov_b32 s11, s7 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX9-SDAG-NEXT: s_mov_b32 s10, s2 +; GFX9-SDAG-NEXT: s_mov_b32 s11, s3 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_mov_b32 s8, s2 -; GFX9-SDAG-NEXT: s_mov_b32 s9, s3 +; GFX9-SDAG-NEXT: s_mov_b32 s8, s6 +; GFX9-SDAG-NEXT: s_mov_b32 s9, s7 ; GFX9-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX9-SDAG-NEXT: s_mov_b32 s4, s0 -; GFX9-SDAG-NEXT: s_mov_b32 s5, s1 +; GFX9-SDAG-NEXT: s_mov_b32 s0, s4 +; GFX9-SDAG-NEXT: s_mov_b32 s1, s5 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: fptrunc_f32_to_f16: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-GISEL-NEXT: s_mov_b32 s6, -1 +; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2 -; GFX9-GISEL-NEXT: s_mov_b32 s2, -1 -; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0 +; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: fptrunc_f32_to_f16: @@ -208,34 +208,34 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; ; GFX9-SDAG-LABEL: fptrunc_f64_to_f16: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 -; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 -; GFX9-SDAG-NEXT: s_mov_b32 s11, s7 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX9-SDAG-NEXT: s_mov_b32 s10, s2 +; GFX9-SDAG-NEXT: s_mov_b32 s11, s3 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_mov_b32 s8, s2 -; GFX9-SDAG-NEXT: s_mov_b32 s9, s3 +; GFX9-SDAG-NEXT: s_mov_b32 s8, s6 +; GFX9-SDAG-NEXT: s_mov_b32 s9, s7 ; GFX9-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GFX9-SDAG-NEXT: s_mov_b32 s4, s0 -; GFX9-SDAG-NEXT: s_mov_b32 s5, s1 +; GFX9-SDAG-NEXT: s_mov_b32 s0, s4 +; GFX9-SDAG-NEXT: s_mov_b32 s1, s5 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: fptrunc_f64_to_f16: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-GISEL-NEXT: s_mov_b32 s6, -1 +; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] -; GFX9-GISEL-NEXT: s_mov_b32 s2, -1 -; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[0:1] ; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: fptrunc_f64_to_f16: @@ -358,36 +358,36 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( ; ; GFX9-SDAG-LABEL: fptrunc_v2f32_to_v2f16: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 -; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 -; GFX9-SDAG-NEXT: s_mov_b32 s11, s7 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX9-SDAG-NEXT: s_mov_b32 s10, s2 +; GFX9-SDAG-NEXT: s_mov_b32 s11, s3 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_mov_b32 s8, s2 -; GFX9-SDAG-NEXT: s_mov_b32 s9, s3 +; GFX9-SDAG-NEXT: s_mov_b32 s8, s6 +; GFX9-SDAG-NEXT: s_mov_b32 s9, s7 ; GFX9-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GFX9-SDAG-NEXT: s_mov_b32 s4, s0 -; GFX9-SDAG-NEXT: s_mov_b32 s5, s1 +; GFX9-SDAG-NEXT: s_mov_b32 s0, s4 +; GFX9-SDAG-NEXT: s_mov_b32 s1, s5 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: fptrunc_v2f32_to_v2f16: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-GISEL-NEXT: s_mov_b32 s6, -1 +; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2 -; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v1, s3 -; GFX9-GISEL-NEXT: s_mov_b32 s2, -1 -; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0 +; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v1, s1 ; GFX9-GISEL-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: fptrunc_v2f32_to_v2f16: @@ -519,40 +519,40 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; ; GFX9-SDAG-LABEL: fptrunc_v2f64_to_v2f16: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 -; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 -; GFX9-SDAG-NEXT: s_mov_b32 s11, s7 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX9-SDAG-NEXT: s_mov_b32 s10, s2 +; GFX9-SDAG-NEXT: s_mov_b32 s11, s3 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_mov_b32 s8, s2 -; GFX9-SDAG-NEXT: s_mov_b32 s9, s3 +; GFX9-SDAG-NEXT: s_mov_b32 s8, s6 +; GFX9-SDAG-NEXT: s_mov_b32 s9, s7 ; GFX9-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GFX9-SDAG-NEXT: s_mov_b32 s4, s0 -; GFX9-SDAG-NEXT: s_mov_b32 s5, s1 +; GFX9-SDAG-NEXT: s_mov_b32 s0, s4 +; GFX9-SDAG-NEXT: s_mov_b32 s1, s5 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_cvt_f32_f64_e32 v2, v[2:3] ; GFX9-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v2 ; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-SDAG-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: fptrunc_v2f64_to_v2f16: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GFX9-GISEL-NEXT: s_mov_b32 s2, -1 -; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-GISEL-NEXT: s_mov_b32 s6, -1 +; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] -; GFX9-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] +; GFX9-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[0:1] +; GFX9-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[2:3] ; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX9-GISEL-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: fptrunc_v2f64_to_v2f16: @@ -673,32 +673,32 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16( ; ; GFX9-SDAG-LABEL: fneg_fptrunc_f32_to_f16: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 -; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 -; GFX9-SDAG-NEXT: s_mov_b32 s11, s7 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX9-SDAG-NEXT: s_mov_b32 s10, s2 +; GFX9-SDAG-NEXT: s_mov_b32 s11, s3 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_mov_b32 s8, s2 -; GFX9-SDAG-NEXT: s_mov_b32 s9, s3 +; GFX9-SDAG-NEXT: s_mov_b32 s8, s6 +; GFX9-SDAG-NEXT: s_mov_b32 s9, s7 ; GFX9-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX9-SDAG-NEXT: s_mov_b32 s4, s0 -; GFX9-SDAG-NEXT: s_mov_b32 s5, s1 +; GFX9-SDAG-NEXT: s_mov_b32 s0, s4 +; GFX9-SDAG-NEXT: s_mov_b32 s1, s5 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: fneg_fptrunc_f32_to_f16: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-GISEL-NEXT: s_mov_b32 s6, -1 +; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -s2 -; GFX9-GISEL-NEXT: s_mov_b32 s2, -1 -; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -s0 +; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: fneg_fptrunc_f32_to_f16: @@ -807,32 +807,32 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16( ; ; GFX9-SDAG-LABEL: fabs_fptrunc_f32_to_f16: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 -; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 -; GFX9-SDAG-NEXT: s_mov_b32 s11, s7 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX9-SDAG-NEXT: s_mov_b32 s10, s2 +; GFX9-SDAG-NEXT: s_mov_b32 s11, s3 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_mov_b32 s8, s2 -; GFX9-SDAG-NEXT: s_mov_b32 s9, s3 +; GFX9-SDAG-NEXT: s_mov_b32 s8, s6 +; GFX9-SDAG-NEXT: s_mov_b32 s9, s7 ; GFX9-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX9-SDAG-NEXT: s_mov_b32 s4, s0 -; GFX9-SDAG-NEXT: s_mov_b32 s5, s1 +; GFX9-SDAG-NEXT: s_mov_b32 s0, s4 +; GFX9-SDAG-NEXT: s_mov_b32 s1, s5 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_cvt_f16_f32_e64 v0, |v0| -; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: fabs_fptrunc_f32_to_f16: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-GISEL-NEXT: s_mov_b32 s6, -1 +; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s2| -; GFX9-GISEL-NEXT: s_mov_b32 s2, -1 -; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s0| +; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: fabs_fptrunc_f32_to_f16: @@ -941,32 +941,32 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16( ; ; GFX9-SDAG-LABEL: fneg_fabs_fptrunc_f32_to_f16: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 -; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 -; GFX9-SDAG-NEXT: s_mov_b32 s11, s7 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX9-SDAG-NEXT: s_mov_b32 s10, s2 +; GFX9-SDAG-NEXT: s_mov_b32 s11, s3 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_mov_b32 s8, s2 -; GFX9-SDAG-NEXT: s_mov_b32 s9, s3 +; GFX9-SDAG-NEXT: s_mov_b32 s8, s6 +; GFX9-SDAG-NEXT: s_mov_b32 s9, s7 ; GFX9-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX9-SDAG-NEXT: s_mov_b32 s4, s0 -; GFX9-SDAG-NEXT: s_mov_b32 s5, s1 +; GFX9-SDAG-NEXT: s_mov_b32 s0, s4 +; GFX9-SDAG-NEXT: s_mov_b32 s1, s5 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -|v0| -; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: fneg_fabs_fptrunc_f32_to_f16: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-GISEL-NEXT: s_mov_b32 s6, -1 +; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -|s2| -; GFX9-GISEL-NEXT: s_mov_b32 s2, -1 -; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -|s0| +; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: fneg_fabs_fptrunc_f32_to_f16: @@ -1076,32 +1076,32 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32( ; ; GFX9-SDAG-LABEL: fptrunc_f32_to_f16_zext_i32: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 -; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 -; GFX9-SDAG-NEXT: s_mov_b32 s11, s7 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX9-SDAG-NEXT: s_mov_b32 s10, s2 +; GFX9-SDAG-NEXT: s_mov_b32 s11, s3 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_mov_b32 s8, s2 -; GFX9-SDAG-NEXT: s_mov_b32 s9, s3 +; GFX9-SDAG-NEXT: s_mov_b32 s8, s6 +; GFX9-SDAG-NEXT: s_mov_b32 s9, s7 ; GFX9-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX9-SDAG-NEXT: s_mov_b32 s4, s0 -; GFX9-SDAG-NEXT: s_mov_b32 s5, s1 +; GFX9-SDAG-NEXT: s_mov_b32 s0, s4 +; GFX9-SDAG-NEXT: s_mov_b32 s1, s5 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: fptrunc_f32_to_f16_zext_i32: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-GISEL-NEXT: s_mov_b32 s6, -1 +; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2 -; GFX9-GISEL-NEXT: s_mov_b32 s2, -1 -; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0 +; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: fptrunc_f32_to_f16_zext_i32: @@ -1215,32 +1215,32 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( ; ; GFX9-SDAG-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 -; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 -; GFX9-SDAG-NEXT: s_mov_b32 s11, s7 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX9-SDAG-NEXT: s_mov_b32 s10, s2 +; GFX9-SDAG-NEXT: s_mov_b32 s11, s3 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_mov_b32 s8, s2 -; GFX9-SDAG-NEXT: s_mov_b32 s9, s3 +; GFX9-SDAG-NEXT: s_mov_b32 s8, s6 +; GFX9-SDAG-NEXT: s_mov_b32 s9, s7 ; GFX9-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX9-SDAG-NEXT: s_mov_b32 s4, s0 -; GFX9-SDAG-NEXT: s_mov_b32 s5, s1 +; GFX9-SDAG-NEXT: s_mov_b32 s0, s4 +; GFX9-SDAG-NEXT: s_mov_b32 s1, s5 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_cvt_f16_f32_e64 v0, |v0| -; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-GISEL-NEXT: s_mov_b32 s6, -1 +; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s2| -; GFX9-GISEL-NEXT: s_mov_b32 s2, -1 -; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s0| +; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: @@ -1359,34 +1359,34 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32( ; ; GFX9-SDAG-LABEL: fptrunc_f32_to_f16_sext_i32: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 -; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 -; GFX9-SDAG-NEXT: s_mov_b32 s11, s7 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX9-SDAG-NEXT: s_mov_b32 s10, s2 +; GFX9-SDAG-NEXT: s_mov_b32 s11, s3 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_mov_b32 s8, s2 -; GFX9-SDAG-NEXT: s_mov_b32 s9, s3 +; GFX9-SDAG-NEXT: s_mov_b32 s8, s6 +; GFX9-SDAG-NEXT: s_mov_b32 s9, s7 ; GFX9-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX9-SDAG-NEXT: s_mov_b32 s4, s0 -; GFX9-SDAG-NEXT: s_mov_b32 s5, s1 +; GFX9-SDAG-NEXT: s_mov_b32 s0, s4 +; GFX9-SDAG-NEXT: s_mov_b32 s1, s5 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: fptrunc_f32_to_f16_sext_i32: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-GISEL-NEXT: s_mov_b32 s6, -1 +; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2 -; GFX9-GISEL-NEXT: s_mov_b32 s2, -1 +; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0 ; GFX9-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: fptrunc_f32_to_f16_sext_i32: diff --git a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll index f6df1cbbdd06b..2a79793443fb2 100644 --- a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll @@ -111,23 +111,41 @@ define amdgpu_kernel void @fsub_f16_imm_a( ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; GFX89-LABEL: fsub_f16_imm_a: -; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: s_mov_b32 s10, s6 -; GFX89-NEXT: s_mov_b32 s11, s7 -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s8, s2 -; GFX89-NEXT: s_mov_b32 s9, s3 -; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: s_mov_b32 s4, s0 -; GFX89-NEXT: s_mov_b32 s5, s1 -; GFX89-NEXT: v_sub_f16_e32 v0, 1.0, v0 -; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 -; GFX89-NEXT: s_endpgm +; VI-LABEL: fsub_f16_imm_a: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: v_sub_f16_e32 v0, 1.0, v0 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fsub_f16_imm_a: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: v_sub_f16_e32 v0, 1.0, v0 +; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: fsub_f16_imm_a: ; GFX11: ; %bb.0: ; %entry @@ -178,23 +196,41 @@ define amdgpu_kernel void @fsub_f16_imm_b( ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; GFX89-LABEL: fsub_f16_imm_b: -; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: s_mov_b32 s10, s6 -; GFX89-NEXT: s_mov_b32 s11, s7 -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s8, s2 -; GFX89-NEXT: s_mov_b32 s9, s3 -; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: s_mov_b32 s4, s0 -; GFX89-NEXT: s_mov_b32 s5, s1 -; GFX89-NEXT: v_add_f16_e32 v0, -2.0, v0 -; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 -; GFX89-NEXT: s_endpgm +; VI-LABEL: fsub_f16_imm_b: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: v_add_f16_e32 v0, -2.0, v0 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fsub_f16_imm_b: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: v_add_f16_e32 v0, -2.0, v0 +; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: fsub_f16_imm_b: ; GFX11: ; %bb.0: ; %entry @@ -390,21 +426,21 @@ define amdgpu_kernel void @fsub_v2f16_imm_a( ; ; GFX9-LABEL: fsub_v2f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_mov_b32 s10, s6 -; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s2 -; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 ; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s0, 0x40003c00 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s4, 0x40003c00 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_add_f16 v0, v0, s0 neg_lo:[1,0] neg_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 neg_lo:[1,0] neg_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: fsub_v2f16_imm_a: @@ -485,21 +521,21 @@ define amdgpu_kernel void @fsub_v2f16_imm_b( ; ; GFX9-LABEL: fsub_v2f16_imm_b: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_mov_b32 s10, s6 -; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s2 -; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 ; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s0, 0xbc00c000 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s4, 0xbc00c000 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_add_f16 v0, v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: fsub_v2f16_imm_b: diff --git a/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll b/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll index 7f6a3ad5c9346..31069e567d86d 100644 --- a/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll +++ b/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll @@ -135,10 +135,10 @@ define amdgpu_kernel void @half4(ptr addrspace(1) nocapture readonly %0, ptr add ; GFX908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX908-NEXT: v_mov_b32_e32 v2, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX908-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v0, s0 -; GFX908-NEXT: v_mov_b32_e32 v1, s1 +; GFX908-NEXT: v_mov_b32_e32 v0, s4 +; GFX908-NEXT: v_mov_b32_e32 v1, s5 ; GFX908-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX908-NEXT: s_endpgm ; @@ -147,9 +147,9 @@ define amdgpu_kernel void @half4(ptr addrspace(1) nocapture readonly %0, ptr add ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX90A-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll index fe6d467bb6281..16c09cf600080 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll @@ -5105,13 +5105,13 @@ define amdgpu_kernel void @atomic_load_i32_offset(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: atomic_load_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:16 glc +; GFX9-NEXT: global_load_dword v1, v0, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %in, i64 4 @@ -5159,13 +5159,13 @@ define amdgpu_kernel void @atomic_load_i32_negoffset(ptr addrspace(1) %in, ptr a ; ; GFX9-LABEL: atomic_load_i32_negoffset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:-512 glc +; GFX9-NEXT: global_load_dword v1, v0, s[4:5] offset:-512 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %in, i64 -128 @@ -5211,13 +5211,13 @@ define amdgpu_kernel void @atomic_load_f32_offset(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: atomic_load_f32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:16 glc +; GFX9-NEXT: global_load_dword v1, v0, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr float, ptr addrspace(1) %in, i64 4 @@ -5261,13 +5261,13 @@ define amdgpu_kernel void @atomic_load_i32(ptr addrspace(1) %in, ptr addrspace(1 ; ; GFX9-LABEL: atomic_load_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("agent") seq_cst, align 4 @@ -5812,13 +5812,13 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr addrspace(1) %in, ptr addrs ; ; GFX9-LABEL: atomic_load_i8_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] offset:16 glc +; GFX9-NEXT: global_load_ubyte v1, v0, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_byte v0, v1, s[2:3] +; GFX9-NEXT: global_store_byte v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i8, ptr addrspace(1) %in, i64 16 @@ -5866,13 +5866,13 @@ define amdgpu_kernel void @atomic_load_i8_negoffset(ptr addrspace(1) %in, ptr ad ; ; GFX9-LABEL: atomic_load_i8_negoffset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] offset:-512 glc +; GFX9-NEXT: global_load_ubyte v1, v0, s[4:5] offset:-512 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_byte v0, v1, s[2:3] +; GFX9-NEXT: global_store_byte v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i8, ptr addrspace(1) %in, i64 -512 @@ -5993,13 +5993,13 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: atomic_load_i16_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:16 glc +; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NEXT: global_store_short v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i16, ptr addrspace(1) %in, i64 8 @@ -6047,13 +6047,13 @@ define amdgpu_kernel void @atomic_load_i16_negoffset(ptr addrspace(1) %in, ptr a ; ; GFX9-LABEL: atomic_load_i16_negoffset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:-512 glc +; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] offset:-512 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NEXT: global_store_short v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i16, ptr addrspace(1) %in, i64 -256 @@ -7074,13 +7074,13 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: atomic_load_f16_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:16 glc +; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NEXT: global_store_short v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm %gep = getelementptr half, ptr addrspace(1) %in, i64 8 %val = load atomic half, ptr addrspace(1) %gep seq_cst, align 2 @@ -7127,13 +7127,13 @@ define amdgpu_kernel void @atomic_load_f16_negoffset(ptr addrspace(1) %in, ptr a ; ; GFX9-LABEL: atomic_load_f16_negoffset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:-512 glc +; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] offset:-512 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NEXT: global_store_short v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm %gep = getelementptr half, ptr addrspace(1) %in, i64 -256 %val = load atomic half, ptr addrspace(1) %gep seq_cst, align 2 @@ -7176,13 +7176,13 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr addrspace(1) %in, ptr add ; ; GFX9-LABEL: atomic_load_bf16_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:16 glc +; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NEXT: global_store_short v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm %gep = getelementptr bfloat, ptr addrspace(1) %in, i64 8 %val = load atomic bfloat, ptr addrspace(1) %gep seq_cst, align 2 @@ -7229,13 +7229,13 @@ define amdgpu_kernel void @atomic_load_bf16_negoffset(ptr addrspace(1) %in, ptr ; ; GFX9-LABEL: atomic_load_bf16_negoffset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:-512 glc +; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] offset:-512 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NEXT: global_store_short v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm %gep = getelementptr bfloat, ptr addrspace(1) %in, i64 -256 %val = load atomic bfloat, ptr addrspace(1) %gep seq_cst, align 2 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll index 40f0acf3d5d09..d1a371fc4356f 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll @@ -33,12 +33,12 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX9-LABEL: atomic_add_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_atomic_add_x2 v2, v[0:1], s[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_add_x2 v2, v[0:1], s[4:5] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -259,18 +259,18 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_add_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: s_addc_u32 s1, s1, s5 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_atomic_add_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_add_i64_ret_addr64_offset: @@ -331,12 +331,12 @@ define amdgpu_kernel void @atomic_add_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_add_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_atomic_add_x2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_add_x2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -550,18 +550,18 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_add_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: s_addc_u32 s1, s1, s5 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_atomic_add_x2 v[0:1], v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_add_i64_ret_addr64: @@ -617,12 +617,12 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX9-LABEL: atomic_and_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_atomic_and_x2 v2, v[0:1], s[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_and_x2 v2, v[0:1], s[4:5] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -843,18 +843,18 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_and_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: s_addc_u32 s1, s1, s5 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_atomic_and_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_and_i64_ret_addr64_offset: @@ -915,12 +915,12 @@ define amdgpu_kernel void @atomic_and_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_and_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_atomic_and_x2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_and_x2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -1134,18 +1134,18 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_and_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: s_addc_u32 s1, s1, s5 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_atomic_and_x2 v[0:1], v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_and_i64_ret_addr64: @@ -1201,12 +1201,12 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX9-LABEL: atomic_sub_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_atomic_sub_x2 v2, v[0:1], s[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_sub_x2 v2, v[0:1], s[4:5] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -1427,18 +1427,18 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_sub_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: s_addc_u32 s1, s1, s5 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_sub_i64_ret_addr64_offset: @@ -1499,12 +1499,12 @@ define amdgpu_kernel void @atomic_sub_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_sub_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_atomic_sub_x2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_sub_x2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -1718,18 +1718,18 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_sub_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: s_addc_u32 s1, s1, s5 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_sub_i64_ret_addr64: @@ -1781,12 +1781,12 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX9-LABEL: atomic_max_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_atomic_smax_x2 v2, v[0:1], s[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_smax_x2 v2, v[0:1], s[4:5] offset:32 ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_max_i64_offset: @@ -1994,17 +1994,17 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_max_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: s_addc_u32 s1, s1, s5 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_atomic_smax_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_max_i64_ret_addr64_offset: @@ -2061,12 +2061,12 @@ define amdgpu_kernel void @atomic_max_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_max_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_atomic_smax_x2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_smax_x2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_max_i64: @@ -2267,17 +2267,17 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_max_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: s_addc_u32 s1, s1, s5 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_atomic_smax_x2 v[0:1], v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_max_i64_ret_addr64: @@ -2329,12 +2329,12 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr addrspace(1) %out, i64 %in ; ; GFX9-LABEL: atomic_umax_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_atomic_umax_x2 v2, v[0:1], s[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_umax_x2 v2, v[0:1], s[4:5] offset:32 ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umax_i64_offset: @@ -2542,17 +2542,17 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX9-LABEL: atomic_umax_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: s_addc_u32 s1, s1, s5 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_atomic_umax_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umax_i64_ret_addr64_offset: @@ -2609,12 +2609,12 @@ define amdgpu_kernel void @atomic_umax_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_umax_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_atomic_umax_x2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_umax_x2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umax_i64: @@ -2815,17 +2815,17 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_umax_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: s_addc_u32 s1, s1, s5 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_atomic_umax_x2 v[0:1], v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umax_i64_ret_addr64: @@ -2877,12 +2877,12 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX9-LABEL: atomic_min_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_atomic_smin_x2 v2, v[0:1], s[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_smin_x2 v2, v[0:1], s[4:5] offset:32 ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_min_i64_offset: @@ -3090,17 +3090,17 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_min_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: s_addc_u32 s1, s1, s5 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_atomic_smin_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_min_i64_ret_addr64_offset: @@ -3157,12 +3157,12 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_min_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_atomic_smin_x2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_smin_x2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_min_i64: @@ -3363,17 +3363,17 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_min_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: s_addc_u32 s1, s1, s5 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_atomic_smin_x2 v[0:1], v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_min_i64_ret_addr64: @@ -3425,12 +3425,12 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr addrspace(1) %out, i64 %in ; ; GFX9-LABEL: atomic_umin_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_atomic_umin_x2 v2, v[0:1], s[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_umin_x2 v2, v[0:1], s[4:5] offset:32 ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umin_i64_offset: @@ -3638,17 +3638,17 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX9-LABEL: atomic_umin_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: s_addc_u32 s1, s1, s5 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_atomic_umin_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umin_i64_ret_addr64_offset: @@ -3705,12 +3705,12 @@ define amdgpu_kernel void @atomic_umin_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_umin_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_atomic_umin_x2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_umin_x2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umin_i64: @@ -3911,17 +3911,17 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_umin_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: s_addc_u32 s1, s1, s5 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_atomic_umin_x2 v[0:1], v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umin_i64_ret_addr64: @@ -3977,12 +3977,12 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX9-LABEL: atomic_or_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_atomic_or_x2 v2, v[0:1], s[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_or_x2 v2, v[0:1], s[4:5] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -4203,18 +4203,18 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr addrspace(1) %out ; ; GFX9-LABEL: atomic_or_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: s_addc_u32 s1, s1, s5 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_atomic_or_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_or_i64_ret_addr64_offset: @@ -4275,12 +4275,12 @@ define amdgpu_kernel void @atomic_or_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_or_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_atomic_or_x2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_or_x2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -4494,18 +4494,18 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr addrspace(1) %out, ptr a ; ; GFX9-LABEL: atomic_or_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: s_addc_u32 s1, s1, s5 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_atomic_or_x2 v[0:1], v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_or_i64_ret_addr64: @@ -4561,12 +4561,12 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr addrspace(1) %out, i64 %in ; ; GFX9-LABEL: atomic_xchg_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[4:5] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -4617,12 +4617,12 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr addrspace(1) %out, double ; ; GFX9-LABEL: atomic_xchg_f64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[4:5] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -4673,12 +4673,12 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_xchg_pointer_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[4:5] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -4899,18 +4899,18 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX9-LABEL: atomic_xchg_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: s_addc_u32 s1, s1, s5 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xchg_i64_ret_addr64_offset: @@ -4971,12 +4971,12 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_xchg_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -5190,18 +5190,18 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_xchg_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: s_addc_u32 s1, s1, s5 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xchg_i64_ret_addr64: @@ -5257,12 +5257,12 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX9-LABEL: atomic_xor_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_atomic_xor_x2 v2, v[0:1], s[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_xor_x2 v2, v[0:1], s[4:5] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -5483,18 +5483,18 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_xor_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: s_addc_u32 s1, s1, s5 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xor_i64_ret_addr64_offset: @@ -5555,12 +5555,12 @@ define amdgpu_kernel void @atomic_xor_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_xor_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_atomic_xor_x2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_xor_x2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -5774,18 +5774,18 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_xor_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: s_addc_u32 s1, s1, s5 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xor_i64_ret_addr64: @@ -6001,17 +6001,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_cmpxchg_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_cmpxchg_i64_ret_offset: @@ -6079,16 +6079,16 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_cmpxchg_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: s_addc_u32 s1, s1, s5 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[8:9], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-NEXT: global_atomic_cmpswap_x2 v4, v[0:3], s[0:1] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -6329,17 +6329,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: atomic_cmpxchg_i64_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_cmpxchg_i64_ret: @@ -6404,16 +6404,16 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64 ; ; GFX9-LABEL: atomic_cmpxchg_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: s_addc_u32 s1, s1, s5 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[8:9], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-NEXT: global_atomic_cmpswap_x2 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -6573,13 +6573,13 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: atomic_load_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] offset:32 glc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_load_i64_offset: @@ -6640,13 +6640,13 @@ define amdgpu_kernel void @atomic_load_i64_neg_offset(ptr addrspace(1) %in, ptr ; ; GFX9-LABEL: atomic_load_i64_neg_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] offset:-32 glc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:-32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_load_i64_neg_offset: @@ -6703,13 +6703,13 @@ define amdgpu_kernel void @atomic_load_i64(ptr addrspace(1) %in, ptr addrspace(1 ; ; GFX9-LABEL: atomic_load_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] glc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_load_i64: @@ -7005,12 +7005,12 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_store_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] offset:32 ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_store_i64_offset: @@ -7057,12 +7057,12 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr addrspace(1) %out) { ; ; GFX9-LABEL: atomic_store_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_store_i64: @@ -7320,12 +7320,12 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX9-LABEL: atomic_inc_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_atomic_inc_x2 v2, v[0:1], s[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_inc_x2 v2, v[0:1], s[4:5] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -7528,12 +7528,12 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX9-LABEL: atomic_dec_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_atomic_dec_x2 v2, v[0:1], s[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_dec_x2 v2, v[0:1], s[4:5] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll index 8897ad3e950a5..9af7e0978f9db 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll @@ -5064,37 +5064,37 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_max_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX9-NEXT: s_add_u32 s0, s0, s6 -; GFX9-NEXT: s_addc_u32 s1, s1, s7 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20 -; GFX9-NEXT: s_mov_b64 s[6:7], 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20 +; GFX9-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-NEXT: v_mov_b32_e32 v3, s8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[7:8] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[7:8] ; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] -; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB89_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index @@ -5300,37 +5300,37 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_max_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX9-NEXT: s_add_u32 s0, s0, s6 -; GFX9-NEXT: s_addc_u32 s1, s1, s7 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[6:7], 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-NEXT: v_mov_b32_e32 v3, s8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[7:8] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[7:8] ; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] -; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB91_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index @@ -6526,37 +6526,37 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX9-LABEL: atomic_umax_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX9-NEXT: s_add_u32 s0, s0, s6 -; GFX9-NEXT: s_addc_u32 s1, s1, s7 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20 -; GFX9-NEXT: s_mov_b64 s[6:7], 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20 +; GFX9-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-NEXT: v_mov_b32_e32 v3, s8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: .LBB103_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[7:8] +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[7:8] ; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] -; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB103_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index @@ -6650,37 +6650,37 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_umax_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX9-NEXT: s_add_u32 s0, s0, s6 -; GFX9-NEXT: s_addc_u32 s1, s1, s7 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[6:7], 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-NEXT: v_mov_b32_e32 v3, s8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: .LBB104_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[7:8] +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[7:8] ; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] -; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB104_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index @@ -8862,37 +8862,37 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_min_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX9-NEXT: s_add_u32 s0, s0, s6 -; GFX9-NEXT: s_addc_u32 s1, s1, s7 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20 -; GFX9-NEXT: s_mov_b64 s[6:7], 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20 +; GFX9-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-NEXT: v_mov_b32_e32 v3, s8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: .LBB126_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[7:8] +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[7:8] ; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] -; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB126_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index @@ -8972,29 +8972,29 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_min_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_mov_b32_e32 v5, s2 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: .LBB127_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz .LBB127_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm @@ -9087,37 +9087,37 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_min_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX9-NEXT: s_add_u32 s0, s0, s6 -; GFX9-NEXT: s_addc_u32 s1, s1, s7 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[6:7], 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-NEXT: v_mov_b32_e32 v3, s8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: .LBB128_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[7:8] +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[7:8] ; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] -; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB128_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll index 297b5180dfe9b..5abd4c9069c91 100644 --- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll +++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll @@ -35,8 +35,8 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: ; implicit-def: $vgpr4 : SGPR spill to VGPR lane ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: s_load_dwordx4 s[28:31], s[4:5], 0x0 -; CHECK-NEXT: s_movk_i32 s4, 0x130 -; CHECK-NEXT: s_mov_b32 s5, s24 +; CHECK-NEXT: s_movk_i32 s20, 0x130 +; CHECK-NEXT: s_mov_b32 s21, s24 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_writelane_b32 v4, s36, 0 ; CHECK-NEXT: v_writelane_b32 v4, s37, 1 @@ -49,7 +49,7 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_writelane_b32 v4, s44, 8 ; CHECK-NEXT: v_writelane_b32 v4, s45, 9 ; CHECK-NEXT: v_writelane_b32 v4, s46, 10 -; CHECK-NEXT: s_load_dwordx16 s[4:19], s[4:5], 0x0 +; CHECK-NEXT: s_load_dwordx16 s[4:19], s[20:21], 0x0 ; CHECK-NEXT: v_writelane_b32 v4, s47, 11 ; CHECK-NEXT: v_writelane_b32 v4, s48, 12 ; CHECK-NEXT: v_writelane_b32 v4, s49, 13 @@ -78,17 +78,17 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_writelane_b32 v4, s13, 25 ; CHECK-NEXT: v_writelane_b32 v4, s14, 26 ; CHECK-NEXT: v_writelane_b32 v4, s15, 27 -; CHECK-NEXT: v_writelane_b32 v4, s16, 28 ; CHECK-NEXT: v_writelane_b32 v8, s52, 18 -; CHECK-NEXT: v_writelane_b32 v4, s17, 29 +; CHECK-NEXT: v_writelane_b32 v4, s16, 28 ; CHECK-NEXT: v_writelane_b32 v8, s53, 19 -; CHECK-NEXT: v_writelane_b32 v4, s18, 30 +; CHECK-NEXT: v_writelane_b32 v4, s17, 29 ; CHECK-NEXT: v_writelane_b32 v8, s54, 20 -; CHECK-NEXT: v_writelane_b32 v4, s19, 31 -; CHECK-NEXT: s_mov_b32 s4, 48 -; CHECK-NEXT: s_mov_b32 s5, s24 +; CHECK-NEXT: v_writelane_b32 v4, s18, 30 +; CHECK-NEXT: s_mov_b32 s26, 48 +; CHECK-NEXT: s_mov_b32 s27, s24 ; CHECK-NEXT: v_writelane_b32 v8, s55, 21 -; CHECK-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; CHECK-NEXT: v_writelane_b32 v4, s19, 31 +; CHECK-NEXT: s_load_dwordx8 s[4:11], s[26:27], 0x0 ; CHECK-NEXT: v_writelane_b32 v8, s56, 22 ; CHECK-NEXT: v_writelane_b32 v8, s57, 23 ; CHECK-NEXT: v_writelane_b32 v8, s58, 24 @@ -107,15 +107,15 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_writelane_b32 v8, s65, 31 ; CHECK-NEXT: v_writelane_b32 v4, s9, 37 ; CHECK-NEXT: v_writelane_b32 v8, s66, 32 -; CHECK-NEXT: s_movk_i32 s26, 0x1f0 -; CHECK-NEXT: s_movk_i32 s28, 0x2f0 -; CHECK-NEXT: s_mov_b32 s27, s24 +; CHECK-NEXT: s_movk_i32 s28, 0x1f0 +; CHECK-NEXT: s_movk_i32 s30, 0x2f0 ; CHECK-NEXT: s_mov_b32 s29, s24 +; CHECK-NEXT: s_mov_b32 s31, s24 ; CHECK-NEXT: v_writelane_b32 v4, s10, 38 ; CHECK-NEXT: v_writelane_b32 v8, s67, 33 ; CHECK-NEXT: v_writelane_b32 v4, s11, 39 -; CHECK-NEXT: s_load_dwordx16 s[52:67], s[26:27], 0x0 -; CHECK-NEXT: s_load_dwordx16 s[4:19], s[28:29], 0x0 +; CHECK-NEXT: s_load_dwordx16 s[52:67], s[28:29], 0x0 +; CHECK-NEXT: s_load_dwordx16 s[4:19], s[30:31], 0x0 ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; CHECK-NEXT: s_xor_b64 s[24:25], vcc, -1 diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll index 78653d7e21ad8..81210d8f5d0ca 100644 --- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -638,9 +638,9 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-LABEL: udiv16_invariant_denom: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 -; GFX9-NEXT: s_movk_i32 s4, 0x400 +; GFX9-NEXT: s_movk_i32 s2, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v3, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s0, s0, 0xffff @@ -651,14 +651,14 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: v_cvt_f32_u32_sdwa v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX9-NEXT: v_add_u16_e32 v2, 1, v2 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s4, v2 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s2, v2 ; GFX9-NEXT: v_mul_f32_e32 v6, v4, v1 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6 ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v6 ; GFX9-NEXT: v_mad_f32 v4, -v6, v0, v4 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, v0 ; GFX9-NEXT: v_addc_co_u32_e64 v4, s[0:1], 0, v7, s[0:1] -; GFX9-NEXT: global_store_short v5, v4, s[2:3] +; GFX9-NEXT: global_store_short v5, v4, s[4:5] ; GFX9-NEXT: s_cbranch_vccz .LBB4_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll index 1d68b0ba0a280..fee59455da4c8 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll @@ -2044,13 +2044,13 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_3src: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] -; GFX9-NODL-NEXT: global_load_dword v3, v0, s[4:5] -; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NODL-NEXT: global_load_dword v3, v0, s[8:9] +; GFX9-NODL-NEXT: s_load_dword s0, s[10:11], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v4, sext(v1), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 @@ -2062,20 +2062,20 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: v_add3_u32 v2, v4, s0, v2 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[10:11] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_3src: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[0:1] -; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_mov_b32 s0, 0x706010c ; GFX9-DL-NEXT: s_mov_b32 s2, 0xc0c0c00 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[8:9] +; GFX9-DL-NEXT: global_load_dword v3, v0, s[4:5] +; GFX9-DL-NEXT: s_load_dword s1, s[10:11], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_perm_b32 v1, v2, v1, s0 @@ -2084,20 +2084,19 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v3, v1, s1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[10:11] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_3src: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x2 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v3, v0, s[0:1] -; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[8:9] +; GFX10-DL-NEXT: global_load_dword v3, v0, s[4:5] +; GFX10-DL-NEXT: s_load_dword s0, s[10:11], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_perm_b32 v0, v2, v1, 0x706010c ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) @@ -2107,7 +2106,7 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-DL-NEXT: v_dot4c_i32_i8 v1, v3, v0 -; GFX10-DL-NEXT: global_store_dword v2, v1, s[6:7] +; GFX10-DL-NEXT: global_store_dword v2, v1, s[10:11] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: idot4_acc32_3src: @@ -2250,13 +2249,13 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_3src_3ele: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] -; GFX9-NODL-NEXT: global_load_dword v3, v0, s[4:5] -; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NODL-NEXT: global_load_dword v3, v0, s[8:9] +; GFX9-NODL-NEXT: s_load_dword s0, s[10:11], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_bfe_i32 v4, v1, 0, 8 @@ -2267,21 +2266,21 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: v_mad_i32_i24 v3, v4, v4, s0 ; GFX9-NODL-NEXT: v_add3_u32 v1, v3, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[10:11] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_3src_3ele: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[0:1] -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: s_mov_b32 s0, 0xc06010c ; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0c00 ; GFX9-DL-NEXT: s_mov_b32 s2, 0xc020100 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[8:9] +; GFX9-DL-NEXT: global_load_dword v3, v0, s[4:5] +; GFX9-DL-NEXT: s_load_dword s3, s[10:11], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_perm_b32 v1, v2, v1, s0 @@ -2291,20 +2290,19 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_perm_b32 v2, v3, v3, s2 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s3 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[10:11] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_3src_3ele: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x2 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v3, v0, s[0:1] -; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[8:9] +; GFX10-DL-NEXT: global_load_dword v3, v0, s[4:5] +; GFX10-DL-NEXT: s_load_dword s0, s[10:11], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_perm_b32 v0, v2, v1, 0xc06010c ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) @@ -2315,7 +2313,7 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-DL-NEXT: v_dot4c_i32_i8 v1, v2, v0 -; GFX10-DL-NEXT: global_store_dword v3, v1, s[6:7] +; GFX10-DL-NEXT: global_store_dword v3, v1, s[10:11] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: idot4_acc32_3src_3ele: @@ -2856,13 +2854,13 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_3src_3ele_src0: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v3, v0, s[0:1] -; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[8:9] +; GFX9-NODL-NEXT: global_load_dword v3, v0, s[4:5] +; GFX9-NODL-NEXT: s_load_dword s0, s[10:11], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_bfe_i32 v4, v1, 8, 8 @@ -2873,21 +2871,21 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: v_mad_i32_i24 v3, v4, v4, s0 ; GFX9-NODL-NEXT: v_add3_u32 v1, v3, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[10:11] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_3src_3ele_src0: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[2:3] -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: s_mov_b32 s0, 0xc06010c ; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0c01 ; GFX9-DL-NEXT: s_mov_b32 s2, 0xc020101 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: global_load_dword v1, v0, s[8:9] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] +; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] +; GFX9-DL-NEXT: s_load_dword s3, s[10:11], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_perm_b32 v1, v1, v2, s0 @@ -2897,20 +2895,19 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_perm_b32 v2, v3, v3, s2 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s3 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[10:11] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_3src_3ele_src0: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x2 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] -; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3] -; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[8:9] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] +; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] +; GFX10-DL-NEXT: s_load_dword s0, s[10:11], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_perm_b32 v0, v1, v2, 0xc06010c ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) @@ -2921,7 +2918,7 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-DL-NEXT: v_dot4c_i32_i8 v1, v2, v0 -; GFX10-DL-NEXT: global_store_dword v3, v1, s[6:7] +; GFX10-DL-NEXT: global_store_dword v3, v1, s[10:11] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: idot4_acc32_3src_3ele_src0: diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll index fb94b504781b1..2894ae76c0be4 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -3494,13 +3494,13 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_acc32_3src: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] -; GFX9-NODL-NEXT: global_load_dword v3, v0, s[4:5] -; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NODL-NEXT: global_load_dword v3, v0, s[8:9] +; GFX9-NODL-NEXT: s_load_dword s0, s[10:11], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v4, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 @@ -3512,20 +3512,20 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: v_add3_u32 v2, v4, s0, v2 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[10:11] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc32_3src: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[0:1] -; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_mov_b32 s0, 0x706010c ; GFX9-DL-NEXT: s_mov_b32 s2, 0xc0c0c00 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[8:9] +; GFX9-DL-NEXT: global_load_dword v3, v0, s[4:5] +; GFX9-DL-NEXT: s_load_dword s1, s[10:11], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_perm_b32 v1, v2, v1, s0 @@ -3534,20 +3534,19 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v3, v1, s1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[10:11] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc32_3src: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x2 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v3, v0, s[0:1] -; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[8:9] +; GFX10-DL-NEXT: global_load_dword v3, v0, s[4:5] +; GFX10-DL-NEXT: s_load_dword s0, s[10:11], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_perm_b32 v0, v2, v1, 0x706010c ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) @@ -3556,7 +3555,7 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v3, v0, s0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[6:7] +; GFX10-DL-NEXT: global_store_dword v1, v0, s[10:11] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: udot4_acc32_3src: @@ -3700,13 +3699,13 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_acc32_3src_3ele: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] -; GFX9-NODL-NEXT: global_load_dword v3, v0, s[4:5] -; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NODL-NEXT: global_load_dword v3, v0, s[8:9] +; GFX9-NODL-NEXT: s_load_dword s0, s[10:11], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xff, v1 @@ -3717,21 +3716,21 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: v_mad_u32_u24 v3, v4, v4, s0 ; GFX9-NODL-NEXT: v_add3_u32 v1, v3, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[10:11] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc32_3src_3ele: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[0:1] -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: s_mov_b32 s0, 0xc06010c ; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0c00 ; GFX9-DL-NEXT: s_mov_b32 s2, 0xc020100 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[8:9] +; GFX9-DL-NEXT: global_load_dword v3, v0, s[4:5] +; GFX9-DL-NEXT: s_load_dword s3, s[10:11], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_perm_b32 v1, v2, v1, s0 @@ -3741,20 +3740,19 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_perm_b32 v2, v3, v3, s2 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s3 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[10:11] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc32_3src_3ele: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x2 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v3, v0, s[0:1] -; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[8:9] +; GFX10-DL-NEXT: global_load_dword v3, v0, s[4:5] +; GFX10-DL-NEXT: s_load_dword s0, s[10:11], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_perm_b32 v0, v2, v1, 0xc06010c ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) @@ -3764,7 +3762,7 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_perm_b32 v1, v3, v3, 0xc020100 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0 -; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7] +; GFX10-DL-NEXT: global_store_dword v2, v0, s[10:11] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: udot4_acc32_3src_3ele: @@ -4306,13 +4304,13 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_acc32_3src_3ele_src0: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v3, v0, s[0:1] -; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[8:9] +; GFX9-NODL-NEXT: global_load_dword v3, v0, s[4:5] +; GFX9-NODL-NEXT: s_load_dword s0, s[10:11], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_bfe_u32 v4, v1, 8, 8 @@ -4323,21 +4321,21 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: v_mad_u32_u24 v3, v4, v4, s0 ; GFX9-NODL-NEXT: v_add3_u32 v1, v3, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[10:11] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc32_3src_3ele_src0: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[2:3] -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: s_mov_b32 s0, 0xc06010c ; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0c01 ; GFX9-DL-NEXT: s_mov_b32 s2, 0xc020101 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: global_load_dword v1, v0, s[8:9] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] +; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] +; GFX9-DL-NEXT: s_load_dword s3, s[10:11], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_perm_b32 v1, v1, v2, s0 @@ -4347,20 +4345,19 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_perm_b32 v2, v3, v3, s2 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s3 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[10:11] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc32_3src_3ele_src0: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x2 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] -; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3] -; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[8:9] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] +; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] +; GFX10-DL-NEXT: s_load_dword s0, s[10:11], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_perm_b32 v0, v1, v2, 0xc06010c ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) @@ -4370,7 +4367,7 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_perm_b32 v1, v3, v3, 0xc020101 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0 -; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7] +; GFX10-DL-NEXT: global_store_dword v2, v0, s[10:11] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: udot4_acc32_3src_3ele_src0: @@ -5597,15 +5594,16 @@ define amdgpu_kernel void @idot4_acc32_v8i8(ptr addrspace(1) %src1, ; ; GFX10-DL-LABEL: idot4_acc32_v8i8: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0 -; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: global_store_dword v2, v0, s[4:5] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: idot4_acc32_v8i8: diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll index 3cabe41afb05a..44e8ae01fd692 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll @@ -14,7 +14,7 @@ define amdgpu_kernel void @extract_w_offset_vgpr(ptr addrspace(1) %out) { ; GCN-NEXT: liveins: $vgpr0, $sgpr2_sgpr3 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY killed $vgpr0 - ; GCN-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr2_sgpr3, 36, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4) + ; GCN-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr2_sgpr3, 36, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4) ; GCN-NEXT: renamable $sgpr6 = COPY renamable $sgpr1 ; GCN-NEXT: renamable $sgpr0 = COPY renamable $sgpr0, implicit killed $sgpr0_sgpr1 ; GCN-NEXT: renamable $sgpr4 = S_MOV_B32 61440 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll index 77722d96d5a4c..eed85345b3b1c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll @@ -281,24 +281,24 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, 1.0 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, v1, 1.0 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm: @@ -361,24 +361,24 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, 1.0, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e32 v1, 1.0, v1 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll index 224de9512c493..67e4feed21fac 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefixes=GCN,PREGFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefixes=GCN,GFX8,PREGFX11 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefixes=GCN,GFX10,PREGFX11 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefixes=GCN,GFX11 %s @@ -549,12 +549,20 @@ end: ; GCN-LABEL: {{^}}test_export_clustering: ; PREGFX11-DAG: v_mov_b32_e32 [[W0:v[0-9]+]], 0 ; PREGFX11-DAG: v_mov_b32_e32 [[W1:v[0-9]+]], 1.0 -; PREGFX11-DAG: v_mov_b32_e32 [[X:v[0-9]+]], s0 -; PREGFX11-DAG: v_mov_b32_e32 [[Y:v[0-9]+]], s1 -; PREGFX11-DAG: v_add_f32_e{{32|64}} [[Z0:v[0-9]+]] -; PREGFX11-DAG: v_sub_f32_e{{32|64}} [[Z1:v[0-9]+]] -; PREGFX11: exp param0 [[X]], [[Y]], [[Z0]], [[W0]]{{$}} -; PREGFX11-NEXT: exp param1 [[X]], [[Y]], [[Z1]], [[W1]] done{{$}} + +; GFX8-DAG: v_mov_b32_e32 [[X:v[0-9]+]], s1 +; GFX8-DAG: v_mov_b32_e32 [[Y:v[0-9]+]], s0 +; GFX8-DAG: v_add_f32_e{{32|64}} [[Z0:v[0-9]+]] +; GFX8-DAG: v_sub_f32_e{{32|64}} [[Z1:v[0-9]+]] +; GFX8: exp param0 [[Y]], [[X]], [[Z0]], [[W0]]{{$}} +; GFX8-NEXT: exp param1 [[Y]], [[X]], [[Z1]], [[W1]] done{{$}} + +; GFX10-DAG: v_mov_b32_e32 [[X:v[0-9]+]], s0 +; GFX10-DAG: v_mov_b32_e32 [[Y:v[0-9]+]], s1 +; GFX10-DAG: v_add_f32_e{{32|64}} [[Z0:v[0-9]+]] +; GFX10-DAG: v_sub_f32_e{{32|64}} [[Z1:v[0-9]+]] +; GFX10: exp param0 [[X]], [[Y]], [[Z0]], [[W0]]{{$}} +; GFX10-NEXT: exp param1 [[X]], [[Y]], [[Z1]], [[W1]] done{{$}} define amdgpu_kernel void @test_export_clustering(float %x, float %y) #0 { %z0 = fadd float %x, %y call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %x, float %y, float %z0, float 0.0, i1 false, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll index d70df38fe6037..dad77d1efd3a8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll @@ -973,12 +973,12 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_oeq: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_eq_f64_e64 s0, 0x40590000, s[6:7] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f64_oeq: @@ -996,12 +996,12 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_oeq: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_eq_f64_e64 s0, 0x40590000, s[6:7] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 1) store i32 %result, ptr addrspace(1) %out @@ -1023,12 +1023,12 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_one: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_neq_f64_e64 s0, 0x40590000, s[6:7] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f64_one: @@ -1046,12 +1046,12 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_one: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_neq_f64_e64 s0, 0x40590000, s[6:7] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 6) store i32 %result, ptr addrspace(1) %out @@ -1073,12 +1073,12 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_ogt: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_lt_f64_e64 s0, 0x40590000, s[6:7] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f64_ogt: @@ -1096,12 +1096,12 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_ogt: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_lt_f64_e64 s0, 0x40590000, s[6:7] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 2) store i32 %result, ptr addrspace(1) %out @@ -1123,12 +1123,12 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_oge: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_le_f64_e64 s2, 0x40590000, s[2:3] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_le_f64_e64 s0, 0x40590000, s[6:7] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f64_oge: @@ -1146,12 +1146,12 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_oge: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_le_f64_e64 s2, 0x40590000, s[2:3] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_le_f64_e64 s0, 0x40590000, s[6:7] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 3) store i32 %result, ptr addrspace(1) %out @@ -1173,12 +1173,12 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_olt: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_gt_f64_e64 s0, 0x40590000, s[6:7] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f64_olt: @@ -1196,12 +1196,12 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_olt: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_gt_f64_e64 s0, 0x40590000, s[6:7] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 4) store i32 %result, ptr addrspace(1) %out @@ -1223,12 +1223,12 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_ole: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_ge_f64_e64 s0, 0x40590000, s[6:7] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f64_ole: @@ -1246,12 +1246,12 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_ole: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_ge_f64_e64 s0, 0x40590000, s[6:7] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 5) store i32 %result, ptr addrspace(1) %out @@ -1273,12 +1273,12 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_ueq: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_nlg_f64_e64 s0, 0x40590000, s[6:7] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f64_ueq: @@ -1296,12 +1296,12 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_ueq: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_nlg_f64_e64 s0, 0x40590000, s[6:7] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 9) store i32 %result, ptr addrspace(1) %out @@ -1323,12 +1323,12 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_o: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_o_f64_e64 s2, 0x40590000, s[2:3] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_o_f64_e64 s0, 0x40590000, s[6:7] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f64_o: @@ -1346,12 +1346,12 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_o: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_o_f64_e64 s2, 0x40590000, s[2:3] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_o_f64_e64 s0, 0x40590000, s[6:7] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 7) store i32 %result, ptr addrspace(1) %out @@ -1373,12 +1373,12 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_uo: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_u_f64_e64 s2, 0x40590000, s[2:3] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_u_f64_e64 s0, 0x40590000, s[6:7] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f64_uo: @@ -1396,12 +1396,12 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_uo: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_u_f64_e64 s2, 0x40590000, s[2:3] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_u_f64_e64 s0, 0x40590000, s[6:7] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 8) store i32 %result, ptr addrspace(1) %out @@ -1423,12 +1423,12 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_une: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_neq_f64_e64 s0, 0x40590000, s[6:7] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f64_une: @@ -1446,12 +1446,12 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_une: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_neq_f64_e64 s0, 0x40590000, s[6:7] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 14) store i32 %result, ptr addrspace(1) %out @@ -1473,12 +1473,12 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_ugt: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_nge_f64_e64 s0, 0x40590000, s[6:7] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f64_ugt: @@ -1496,12 +1496,12 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_ugt: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_nge_f64_e64 s0, 0x40590000, s[6:7] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 10) store i32 %result, ptr addrspace(1) %out @@ -1523,12 +1523,12 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_uge: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_ngt_f64_e64 s0, 0x40590000, s[6:7] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f64_uge: @@ -1546,12 +1546,12 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_uge: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_ngt_f64_e64 s0, 0x40590000, s[6:7] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 11) store i32 %result, ptr addrspace(1) %out @@ -1573,12 +1573,12 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_ult: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_nle_f64_e64 s0, 0x40590000, s[6:7] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f64_ult: @@ -1596,12 +1596,12 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_ult: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_nle_f64_e64 s0, 0x40590000, s[6:7] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 12) store i32 %result, ptr addrspace(1) %out @@ -1623,12 +1623,12 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_ule: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_nlt_f64_e64 s0, 0x40590000, s[6:7] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f64_ule: @@ -1646,12 +1646,12 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_ule: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_nlt_f64_e64 s0, 0x40590000, s[6:7] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 13) store i32 %result, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll index 734d1472d054f..d1883d9196af6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll @@ -1059,15 +1059,15 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_oeq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_f64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_eq_f64_e64 s[0:1], s[6:7], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f64_oeq: @@ -1119,15 +1119,15 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_one: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_neq_f64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_neq_f64_e64 s[0:1], s[6:7], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f64_one: @@ -1179,15 +1179,15 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_ogt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_gt_f64_e64 s[0:1], s[6:7], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f64_ogt: @@ -1239,15 +1239,15 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_oge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ge_f64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_ge_f64_e64 s[0:1], s[6:7], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f64_oge: @@ -1299,15 +1299,15 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_olt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_lt_f64_e64 s[0:1], s[6:7], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f64_olt: @@ -1359,15 +1359,15 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_ole: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_le_f64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_le_f64_e64 s[0:1], s[6:7], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f64_ole: @@ -1419,15 +1419,15 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_ueq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nlg_f64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_nlg_f64_e64 s[0:1], s[6:7], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f64_ueq: @@ -1479,15 +1479,15 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_o: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_o_f64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_o_f64_e64 s[0:1], s[6:7], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f64_o: @@ -1539,15 +1539,15 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_uo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_u_f64_e64 s[0:1], s[6:7], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f64_uo: @@ -1599,15 +1599,15 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_une: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_neq_f64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_neq_f64_e64 s[0:1], s[6:7], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f64_une: @@ -1659,15 +1659,15 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_ugt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nle_f64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_nle_f64_e64 s[0:1], s[6:7], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f64_ugt: @@ -1719,15 +1719,15 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_uge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nlt_f64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_nlt_f64_e64 s[0:1], s[6:7], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f64_uge: @@ -1779,15 +1779,15 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_ult: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nge_f64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_nge_f64_e64 s[0:1], s[6:7], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f64_ult: @@ -1839,15 +1839,15 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_ule: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ngt_f64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_ngt_f64_e64 s[0:1], s[6:7], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f64_ule: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll index 78d5da8dda177..453913b334a49 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll @@ -45,17 +45,6 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16_dpp( ; GFX11-NEXT: v_dot2_bf16_bf16_e64_dpp v0, v2, v0, v1 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX11-NEXT: scratch_store_b16 off, v0, s0 ; GFX11-NEXT: s_endpgm -; GISEL-GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_dpp: -; GISEL-GFX11: ; %bb.0: ; %entry -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: scratch_load_b32 v0, off, s1 -; GISEL-GFX11-NEXT: scratch_load_b32 v1, off, s2 -; GISEL-GFX11-NEXT: scratch_load_u16 v2, off, s3 -; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) -; GISEL-GFX11-NEXT: v_dot2_bf16_bf16_e64_dpp v0, v0, v1, v2 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GISEL-GFX11-NEXT: scratch_store_b16 off, v0, s0 -; GISEL-GFX11-NEXT: s_endpgm ptr addrspace(5) %r, ptr addrspace(5) %a, ptr addrspace(5) %b, diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll index 746b00ba7b231..d0d759a57a682 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll @@ -644,12 +644,12 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_i64_eq: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_eq_u64_e64 s2, 0x64, s[2:3] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_eq_u64_e64 s0, 0x64, s[6:7] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i64_eq: @@ -667,12 +667,12 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i64_eq: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_eq_u64_e64 s2, 0x64, s[2:3] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_eq_u64_e64 s0, 0x64, s[6:7] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 32) store i32 %result, ptr addrspace(1) %out @@ -694,12 +694,12 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_i64_ne: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ne_u64_e64 s2, 0x64, s[2:3] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_ne_u64_e64 s0, 0x64, s[6:7] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i64_ne: @@ -717,12 +717,12 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i64_ne: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ne_u64_e64 s2, 0x64, s[2:3] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_ne_u64_e64 s0, 0x64, s[6:7] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 33) store i32 %result, ptr addrspace(1) %out @@ -744,12 +744,12 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_u64_ugt: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_lt_u64_e64 s2, 0x64, s[2:3] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_lt_u64_e64 s0, 0x64, s[6:7] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_u64_ugt: @@ -767,12 +767,12 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_u64_ugt: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_lt_u64_e64 s2, 0x64, s[2:3] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_lt_u64_e64 s0, 0x64, s[6:7] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 34) store i32 %result, ptr addrspace(1) %out @@ -794,12 +794,12 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_u64_uge: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_le_u64_e64 s2, 0x64, s[2:3] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_le_u64_e64 s0, 0x64, s[6:7] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_u64_uge: @@ -817,12 +817,12 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_u64_uge: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_le_u64_e64 s2, 0x64, s[2:3] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_le_u64_e64 s0, 0x64, s[6:7] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 35) store i32 %result, ptr addrspace(1) %out @@ -844,12 +844,12 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_u64_ult: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_gt_u64_e64 s2, 0x64, s[2:3] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_gt_u64_e64 s0, 0x64, s[6:7] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_u64_ult: @@ -867,12 +867,12 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_u64_ult: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_gt_u64_e64 s2, 0x64, s[2:3] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_gt_u64_e64 s0, 0x64, s[6:7] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 36) store i32 %result, ptr addrspace(1) %out @@ -894,12 +894,12 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_u64_ule: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ge_u64_e64 s2, 0x64, s[2:3] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_ge_u64_e64 s0, 0x64, s[6:7] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_u64_ule: @@ -917,12 +917,12 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_u64_ule: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ge_u64_e64 s2, 0x64, s[2:3] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_ge_u64_e64 s0, 0x64, s[6:7] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 37) store i32 %result, ptr addrspace(1) %out @@ -944,12 +944,12 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_i64_sgt: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_lt_i64_e64 s2, 0x64, s[2:3] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_lt_i64_e64 s0, 0x64, s[6:7] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i64_sgt: @@ -967,12 +967,12 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i64_sgt: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_lt_i64_e64 s2, 0x64, s[2:3] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_lt_i64_e64 s0, 0x64, s[6:7] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 38) store i32 %result, ptr addrspace(1) %out @@ -994,12 +994,12 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_i64_sge: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_le_i64_e64 s2, 0x64, s[2:3] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_le_i64_e64 s0, 0x64, s[6:7] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i64_sge: @@ -1017,12 +1017,12 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i64_sge: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_le_i64_e64 s2, 0x64, s[2:3] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_le_i64_e64 s0, 0x64, s[6:7] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 39) store i32 %result, ptr addrspace(1) %out @@ -1044,12 +1044,12 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_i64_slt: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_gt_i64_e64 s2, 0x64, s[2:3] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_gt_i64_e64 s0, 0x64, s[6:7] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i64_slt: @@ -1067,12 +1067,12 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i64_slt: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_gt_i64_e64 s2, 0x64, s[2:3] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_gt_i64_e64 s0, 0x64, s[6:7] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 40) store i32 %result, ptr addrspace(1) %out @@ -1094,12 +1094,12 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_i64_sle: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ge_i64_e64 s2, 0x64, s[2:3] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_ge_i64_e64 s0, 0x64, s[6:7] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i64_sle: @@ -1117,12 +1117,12 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i64_sle: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ge_i64_e64 s2, 0x64, s[2:3] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_ge_i64_e64 s0, 0x64, s[6:7] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 41) store i32 %result, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll index 2625c1f152219..cad3c54ae54b0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll @@ -716,15 +716,15 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_i64_eq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i64_eq: @@ -776,15 +776,15 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_i64_ne: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_ne_u64_e64 s[0:1], s[6:7], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i64_ne: @@ -836,15 +836,15 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_u64_ugt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_u64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_gt_u64_e64 s[0:1], s[6:7], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_u64_ugt: @@ -896,15 +896,15 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_u64_uge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ge_u64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_ge_u64_e64 s[0:1], s[6:7], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_u64_uge: @@ -956,15 +956,15 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_u64_ult: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_u64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_u64_ult: @@ -1016,15 +1016,15 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_u64_ule: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_le_u64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_le_u64_e64 s[0:1], s[6:7], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_u64_ule: @@ -1076,15 +1076,15 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_i64_sgt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i64_sgt: @@ -1136,15 +1136,15 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_i64_sge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ge_i64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_ge_i64_e64 s[0:1], s[6:7], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i64_sge: @@ -1196,15 +1196,15 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_i64_slt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[6:7], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i64_slt: @@ -1256,15 +1256,15 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_i64_sle: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_le_i64_e64 s[2:3], s[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_le_i64_e64 s[0:1], s[6:7], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i64_sle: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll index 4930317143a76..0393a551dcd41 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll @@ -352,9 +352,10 @@ main_body: define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) { ; GFX1013-LABEL: image_bvh64_intersect_ray_nsa_reassign: ; GFX1013: ; %bb.0: ; %main_body -; GFX1013-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1013-NEXT: s_clause 0x1 +; GFX1013-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 ; GFX1013-NEXT: v_mov_b32_e32 v3, 0 ; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0 @@ -365,13 +366,13 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; GFX1013-NEXT: v_mov_b32_e32 v10, 0x40e00000 ; GFX1013-NEXT: v_mov_b32_e32 v11, 0x41000000 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) -; GFX1013-NEXT: v_add_co_u32 v0, s4, s4, v0 -; GFX1013-NEXT: v_add_co_ci_u32_e64 v1, s4, s5, 0, s4 +; GFX1013-NEXT: v_add_co_u32 v0, s0, s0, v0 +; GFX1013-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 ; GFX1013-NEXT: flat_load_dword v2, v[0:1] ; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c7 ; GFX1013-NEXT: v_bfrev_b32_e32 v1, 4.0 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3] +; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[4:7] ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1013-NEXT: s_endpgm @@ -447,9 +448,10 @@ main_body: define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) { ; GFX1013-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: ; GFX1013: ; %bb.0: ; %main_body -; GFX1013-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1013-NEXT: s_clause 0x1 +; GFX1013-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 ; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 ; GFX1013-NEXT: v_mov_b32_e32 v3, 0 ; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0 @@ -457,13 +459,13 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x46004500 ; GFX1013-NEXT: v_mov_b32_e32 v8, 0x48004700 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) -; GFX1013-NEXT: v_add_co_u32 v0, s4, s4, v0 -; GFX1013-NEXT: v_add_co_ci_u32_e64 v1, s4, s5, 0, s4 +; GFX1013-NEXT: v_add_co_u32 v0, s0, s0, v0 +; GFX1013-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 ; GFX1013-NEXT: flat_load_dword v2, v[0:1] ; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c6 ; GFX1013-NEXT: v_bfrev_b32_e32 v1, 4.0 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16 +; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[4:7] a16 ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1013-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll index e2f494283a3f2..7371d498a7070 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll @@ -808,11 +808,11 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1032GISEL-NEXT: .LBB4_5: ; %endif ; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX1032GISEL-NEXT: s_endpgm ; ; GFX1164DAGISEL-LABEL: divergent_cfg: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll index 5304188e02f84..60af21524a04a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll @@ -809,11 +809,11 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1032GISEL-NEXT: .LBB4_5: ; %endif ; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX1032GISEL-NEXT: s_endpgm ; ; GFX1164DAGISEL-LABEL: divergent_cfg: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll index 5514efa6838e7..0b18e5f35a310 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll @@ -46,26 +46,26 @@ define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX9-LABEL: cos_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_f16_e32 v1, 0.15915494, v1 ; GFX9-NEXT: v_cos_f16_e32 v1, v1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: cos_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_f16_e32 v1, 0.15915494, v1 ; GFX10-NEXT: v_cos_f16_e32 v1, v1 -; GFX10-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: cos_f16: @@ -142,34 +142,34 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX9-LABEL: cos_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x3118 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_f16_e32 v3, 0.15915494, v1 ; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_cos_f16_e32 v2, v3 ; GFX9-NEXT: v_cos_f16_e32 v1, v1 ; GFX9-NEXT: v_pack_b32_f16 v1, v2, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: cos_v2f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0x3118 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_f16_e32 v3, 0.15915494, v1 ; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_cos_f16_e32 v2, v3 ; GFX10-NEXT: v_cos_f16_e32 v1, v1 ; GFX10-NEXT: v_pack_b32_f16 v1, v2, v1 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: cos_v2f16: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll index acbb8684da924..32b599e63c61d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll @@ -538,10 +538,10 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX900-GISEL-LABEL: s_exp2_v3f32: ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x1f800000 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc @@ -562,7 +562,7 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, v4, v1 ; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] +; GFX900-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] ; GFX900-GISEL-NEXT: s_endpgm ; ; R600-LABEL: s_exp2_v3f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll index edcdd323cb0ae..92658e660ea67 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll @@ -104,60 +104,60 @@ define amdgpu_kernel void @fmuladd_f16( ; ; GFX10-FLUSH-LABEL: fmuladd_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 -; GFX10-FLUSH-NEXT: s_mov_b32 s10, -1 -; GFX10-FLUSH-NEXT: s_mov_b32 s11, 0x31016000 -; GFX10-FLUSH-NEXT: s_mov_b32 s14, s10 -; GFX10-FLUSH-NEXT: s_mov_b32 s15, s11 -; GFX10-FLUSH-NEXT: s_mov_b32 s18, s10 -; GFX10-FLUSH-NEXT: s_mov_b32 s19, s11 +; GFX10-FLUSH-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_mov_b32 s2, -1 +; GFX10-FLUSH-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-FLUSH-NEXT: s_mov_b32 s14, s2 +; GFX10-FLUSH-NEXT: s_mov_b32 s15, s3 +; GFX10-FLUSH-NEXT: s_mov_b32 s18, s2 +; GFX10-FLUSH-NEXT: s_mov_b32 s19, s3 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FLUSH-NEXT: s_mov_b32 s12, s2 -; GFX10-FLUSH-NEXT: s_mov_b32 s13, s3 -; GFX10-FLUSH-NEXT: s_mov_b32 s16, s4 -; GFX10-FLUSH-NEXT: s_mov_b32 s17, s5 +; GFX10-FLUSH-NEXT: s_mov_b32 s12, s6 +; GFX10-FLUSH-NEXT: s_mov_b32 s13, s7 +; GFX10-FLUSH-NEXT: s_mov_b32 s16, s8 +; GFX10-FLUSH-NEXT: s_mov_b32 s17, s9 ; GFX10-FLUSH-NEXT: buffer_load_ushort v0, off, s[12:15], 0 ; GFX10-FLUSH-NEXT: buffer_load_ushort v1, off, s[16:19], 0 -; GFX10-FLUSH-NEXT: s_mov_b32 s4, s6 -; GFX10-FLUSH-NEXT: s_mov_b32 s5, s7 -; GFX10-FLUSH-NEXT: s_mov_b32 s6, s10 -; GFX10-FLUSH-NEXT: s_mov_b32 s7, s11 -; GFX10-FLUSH-NEXT: s_mov_b32 s8, s0 -; GFX10-FLUSH-NEXT: buffer_load_ushort v2, off, s[4:7], 0 -; GFX10-FLUSH-NEXT: s_mov_b32 s9, s1 +; GFX10-FLUSH-NEXT: s_mov_b32 s8, s10 +; GFX10-FLUSH-NEXT: s_mov_b32 s9, s11 +; GFX10-FLUSH-NEXT: s_mov_b32 s10, s2 +; GFX10-FLUSH-NEXT: s_mov_b32 s11, s3 +; GFX10-FLUSH-NEXT: s_mov_b32 s0, s4 +; GFX10-FLUSH-NEXT: buffer_load_ushort v2, off, s[8:11], 0 +; GFX10-FLUSH-NEXT: s_mov_b32 s1, s5 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(1) ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v0, v0, v1 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GFX10-FLUSH-NEXT: v_add_f16_e32 v0, v0, v2 -; GFX10-FLUSH-NEXT: buffer_store_short v0, off, s[8:11], 0 +; GFX10-FLUSH-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX10-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-LABEL: fmuladd_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 -; GFX10-DENORM-NEXT: s_mov_b32 s10, -1 -; GFX10-DENORM-NEXT: s_mov_b32 s11, 0x31016000 -; GFX10-DENORM-NEXT: s_mov_b32 s14, s10 -; GFX10-DENORM-NEXT: s_mov_b32 s15, s11 -; GFX10-DENORM-NEXT: s_mov_b32 s18, s10 -; GFX10-DENORM-NEXT: s_mov_b32 s19, s11 -; GFX10-DENORM-NEXT: s_mov_b32 s22, s10 -; GFX10-DENORM-NEXT: s_mov_b32 s23, s11 +; GFX10-DENORM-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX10-DENORM-NEXT: s_mov_b32 s2, -1 +; GFX10-DENORM-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-DENORM-NEXT: s_mov_b32 s14, s2 +; GFX10-DENORM-NEXT: s_mov_b32 s15, s3 +; GFX10-DENORM-NEXT: s_mov_b32 s18, s2 +; GFX10-DENORM-NEXT: s_mov_b32 s19, s3 +; GFX10-DENORM-NEXT: s_mov_b32 s22, s2 +; GFX10-DENORM-NEXT: s_mov_b32 s23, s3 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-NEXT: s_mov_b32 s12, s2 -; GFX10-DENORM-NEXT: s_mov_b32 s13, s3 -; GFX10-DENORM-NEXT: s_mov_b32 s16, s4 -; GFX10-DENORM-NEXT: s_mov_b32 s17, s5 -; GFX10-DENORM-NEXT: s_mov_b32 s20, s6 -; GFX10-DENORM-NEXT: s_mov_b32 s21, s7 +; GFX10-DENORM-NEXT: s_mov_b32 s12, s6 +; GFX10-DENORM-NEXT: s_mov_b32 s13, s7 +; GFX10-DENORM-NEXT: s_mov_b32 s16, s8 +; GFX10-DENORM-NEXT: s_mov_b32 s17, s9 +; GFX10-DENORM-NEXT: s_mov_b32 s20, s10 +; GFX10-DENORM-NEXT: s_mov_b32 s21, s11 ; GFX10-DENORM-NEXT: buffer_load_ushort v0, off, s[12:15], 0 ; GFX10-DENORM-NEXT: buffer_load_ushort v1, off, s[16:19], 0 ; GFX10-DENORM-NEXT: buffer_load_ushort v2, off, s[20:23], 0 -; GFX10-DENORM-NEXT: s_mov_b32 s8, s0 -; GFX10-DENORM-NEXT: s_mov_b32 s9, s1 +; GFX10-DENORM-NEXT: s_mov_b32 s0, s4 +; GFX10-DENORM-NEXT: s_mov_b32 s1, s5 ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-NEXT: v_fmac_f16_e32 v2, v0, v1 -; GFX10-DENORM-NEXT: buffer_store_short v2, off, s[8:11], 0 +; GFX10-DENORM-NEXT: buffer_store_short v2, off, s[0:3], 0 ; GFX10-DENORM-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: fmuladd_f16: @@ -722,60 +722,60 @@ define amdgpu_kernel void @fmuladd_v2f16( ; ; GFX10-FLUSH-LABEL: fmuladd_v2f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 -; GFX10-FLUSH-NEXT: s_mov_b32 s10, -1 -; GFX10-FLUSH-NEXT: s_mov_b32 s11, 0x31016000 -; GFX10-FLUSH-NEXT: s_mov_b32 s14, s10 -; GFX10-FLUSH-NEXT: s_mov_b32 s15, s11 -; GFX10-FLUSH-NEXT: s_mov_b32 s18, s10 -; GFX10-FLUSH-NEXT: s_mov_b32 s19, s11 +; GFX10-FLUSH-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_mov_b32 s2, -1 +; GFX10-FLUSH-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-FLUSH-NEXT: s_mov_b32 s14, s2 +; GFX10-FLUSH-NEXT: s_mov_b32 s15, s3 +; GFX10-FLUSH-NEXT: s_mov_b32 s18, s2 +; GFX10-FLUSH-NEXT: s_mov_b32 s19, s3 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FLUSH-NEXT: s_mov_b32 s12, s2 -; GFX10-FLUSH-NEXT: s_mov_b32 s13, s3 -; GFX10-FLUSH-NEXT: s_mov_b32 s16, s4 -; GFX10-FLUSH-NEXT: s_mov_b32 s17, s5 +; GFX10-FLUSH-NEXT: s_mov_b32 s12, s6 +; GFX10-FLUSH-NEXT: s_mov_b32 s13, s7 +; GFX10-FLUSH-NEXT: s_mov_b32 s16, s8 +; GFX10-FLUSH-NEXT: s_mov_b32 s17, s9 ; GFX10-FLUSH-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; GFX10-FLUSH-NEXT: buffer_load_dword v1, off, s[16:19], 0 -; GFX10-FLUSH-NEXT: s_mov_b32 s4, s6 -; GFX10-FLUSH-NEXT: s_mov_b32 s5, s7 -; GFX10-FLUSH-NEXT: s_mov_b32 s6, s10 -; GFX10-FLUSH-NEXT: s_mov_b32 s7, s11 -; GFX10-FLUSH-NEXT: s_mov_b32 s8, s0 -; GFX10-FLUSH-NEXT: buffer_load_dword v2, off, s[4:7], 0 -; GFX10-FLUSH-NEXT: s_mov_b32 s9, s1 +; GFX10-FLUSH-NEXT: s_mov_b32 s8, s10 +; GFX10-FLUSH-NEXT: s_mov_b32 s9, s11 +; GFX10-FLUSH-NEXT: s_mov_b32 s10, s2 +; GFX10-FLUSH-NEXT: s_mov_b32 s11, s3 +; GFX10-FLUSH-NEXT: s_mov_b32 s0, s4 +; GFX10-FLUSH-NEXT: buffer_load_dword v2, off, s[8:11], 0 +; GFX10-FLUSH-NEXT: s_mov_b32 s1, s5 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(1) ; GFX10-FLUSH-NEXT: v_pk_mul_f16 v0, v0, v1 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GFX10-FLUSH-NEXT: v_pk_add_f16 v0, v0, v2 -; GFX10-FLUSH-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX10-FLUSH-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-LABEL: fmuladd_v2f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 -; GFX10-DENORM-NEXT: s_mov_b32 s10, -1 -; GFX10-DENORM-NEXT: s_mov_b32 s11, 0x31016000 -; GFX10-DENORM-NEXT: s_mov_b32 s14, s10 -; GFX10-DENORM-NEXT: s_mov_b32 s15, s11 -; GFX10-DENORM-NEXT: s_mov_b32 s18, s10 -; GFX10-DENORM-NEXT: s_mov_b32 s19, s11 -; GFX10-DENORM-NEXT: s_mov_b32 s22, s10 -; GFX10-DENORM-NEXT: s_mov_b32 s23, s11 +; GFX10-DENORM-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX10-DENORM-NEXT: s_mov_b32 s2, -1 +; GFX10-DENORM-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-DENORM-NEXT: s_mov_b32 s14, s2 +; GFX10-DENORM-NEXT: s_mov_b32 s15, s3 +; GFX10-DENORM-NEXT: s_mov_b32 s18, s2 +; GFX10-DENORM-NEXT: s_mov_b32 s19, s3 +; GFX10-DENORM-NEXT: s_mov_b32 s22, s2 +; GFX10-DENORM-NEXT: s_mov_b32 s23, s3 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-NEXT: s_mov_b32 s12, s2 -; GFX10-DENORM-NEXT: s_mov_b32 s13, s3 -; GFX10-DENORM-NEXT: s_mov_b32 s16, s4 -; GFX10-DENORM-NEXT: s_mov_b32 s17, s5 -; GFX10-DENORM-NEXT: s_mov_b32 s20, s6 -; GFX10-DENORM-NEXT: s_mov_b32 s21, s7 +; GFX10-DENORM-NEXT: s_mov_b32 s12, s6 +; GFX10-DENORM-NEXT: s_mov_b32 s13, s7 +; GFX10-DENORM-NEXT: s_mov_b32 s16, s8 +; GFX10-DENORM-NEXT: s_mov_b32 s17, s9 +; GFX10-DENORM-NEXT: s_mov_b32 s20, s10 +; GFX10-DENORM-NEXT: s_mov_b32 s21, s11 ; GFX10-DENORM-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; GFX10-DENORM-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; GFX10-DENORM-NEXT: buffer_load_dword v2, off, s[20:23], 0 -; GFX10-DENORM-NEXT: s_mov_b32 s8, s0 -; GFX10-DENORM-NEXT: s_mov_b32 s9, s1 +; GFX10-DENORM-NEXT: s_mov_b32 s0, s4 +; GFX10-DENORM-NEXT: s_mov_b32 s1, s5 ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-NEXT: v_pk_fma_f16 v0, v0, v1, v2 -; GFX10-DENORM-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX10-DENORM-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-DENORM-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: fmuladd_v2f16: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll index a807885e0d853..e05f3f1e65ff3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll @@ -139,12 +139,12 @@ define amdgpu_kernel void @sgpr_isnan_f64(ptr addrspace(1) %out, double %x) { ; ; GFX9CHECK-LABEL: sgpr_isnan_f64: ; GFX9CHECK: ; %bb.0: -; GFX9CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9CHECK-NEXT: v_mov_b32_e32 v0, 0 ; GFX9CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX9CHECK-NEXT: v_cmp_class_f64_e64 s[2:3], s[2:3], 3 -; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[2:3] -; GFX9CHECK-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9CHECK-NEXT: v_cmp_class_f64_e64 s[0:1], s[6:7], 3 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] +; GFX9CHECK-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9CHECK-NEXT: s_endpgm ; ; GFX10CHECK-LABEL: sgpr_isnan_f64: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll index bc4d03e7af260..50c52037dc4d3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll @@ -630,10 +630,10 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX900-GISEL-LABEL: s_log2_v3f32: ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x42000000 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc @@ -654,7 +654,7 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v4, v1 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v2, v2, v3 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] +; GFX900-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] ; GFX900-GISEL-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: s_log2_v3f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll index c7913f638798a..2c9ce001b8c4d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -194,40 +194,40 @@ define amdgpu_kernel void @maxnum_f16_imm_a( ; ; GFX9-LABEL: maxnum_f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_mov_b32 s10, s6 -; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s2 -; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 ; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX9-NEXT: v_max_f16_e32 v0, 0x4200, v0 -; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: maxnum_f16_imm_a: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 s6, -1 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s10, s6 -; GFX10-NEXT: s_mov_b32 s11, s7 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s10, s2 +; GFX10-NEXT: s_mov_b32 s11, s3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s8, s2 -; GFX10-NEXT: s_mov_b32 s9, s3 -; GFX10-NEXT: s_mov_b32 s4, s0 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s0, s4 ; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_mov_b32 s1, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_max_f16_e32 v0, 0x4200, v0 -; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: maxnum_f16_imm_a: @@ -302,40 +302,40 @@ define amdgpu_kernel void @maxnum_f16_imm_b( ; ; GFX9-LABEL: maxnum_f16_imm_b: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_mov_b32 s10, s6 -; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s2 -; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 ; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX9-NEXT: v_max_f16_e32 v0, 4.0, v0 -; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: maxnum_f16_imm_b: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 s6, -1 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s10, s6 -; GFX10-NEXT: s_mov_b32 s11, s7 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s10, s2 +; GFX10-NEXT: s_mov_b32 s11, s3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s8, s2 -; GFX10-NEXT: s_mov_b32 s9, s3 -; GFX10-NEXT: s_mov_b32 s4, s0 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s0, s4 ; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_mov_b32 s1, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_max_f16_e32 v0, 4.0, v0 -; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: maxnum_f16_imm_b: @@ -524,29 +524,29 @@ define amdgpu_kernel void @maxnum_v2f16_imm_a( ; ; GFX9-LABEL: maxnum_v2f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 -; GFX9-NEXT: s_mov_b32 s4, 0x44004200 -; GFX9-NEXT: v_pk_max_f16 v0, v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: v_pk_max_f16 v0, s0, s0 +; GFX9-NEXT: s_mov_b32 s0, 0x44004200 +; GFX9-NEXT: v_pk_max_f16 v0, v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: maxnum_v2f16_imm_a: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, s2, s2 -; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_pk_max_f16 v0, s0, s0 ; GFX10-NEXT: v_pk_max_f16 v0, 0x44004200, v0 -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: maxnum_v2f16_imm_a: @@ -614,29 +614,29 @@ define amdgpu_kernel void @maxnum_v2f16_imm_b( ; ; GFX9-LABEL: maxnum_v2f16_imm_b: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 -; GFX9-NEXT: s_mov_b32 s4, 0x42004400 -; GFX9-NEXT: v_pk_max_f16 v0, v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: v_pk_max_f16 v0, s0, s0 +; GFX9-NEXT: s_mov_b32 s0, 0x42004400 +; GFX9-NEXT: v_pk_max_f16 v0, v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: maxnum_v2f16_imm_b: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, s2, s2 -; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_pk_max_f16 v0, s0, s0 ; GFX10-NEXT: v_pk_max_f16 v0, 0x42004400, v0 -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: maxnum_v2f16_imm_b: @@ -1007,36 +1007,36 @@ define amdgpu_kernel void @fmax_v4f16_imm_a( ; ; GFX9-LABEL: fmax_v4f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s8, 0x44004200 ; GFX9-NEXT: s_mov_b32 s9, 0x40004800 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s3, s3 -; GFX9-NEXT: v_pk_max_f16 v2, s2, s2 +; GFX9-NEXT: v_pk_max_f16 v0, s7, s7 +; GFX9-NEXT: v_pk_max_f16 v2, s6, s6 ; GFX9-NEXT: v_pk_max_f16 v1, v0, s8 ; GFX9-NEXT: v_pk_max_f16 v0, v2, s9 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: fmax_v4f16_imm_a: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, s3, s3 -; GFX10-NEXT: v_pk_max_f16 v2, s2, s2 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_pk_max_f16 v0, s1, s1 +; GFX10-NEXT: v_pk_max_f16 v2, s0, s0 ; GFX10-NEXT: v_pk_max_f16 v1, 0x44004200, v0 ; GFX10-NEXT: v_pk_max_f16 v0, 0x40004800, v2 -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fmax_v4f16_imm_a: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll index 0a004fd7701cf..59508e049a3a9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -221,40 +221,40 @@ define amdgpu_kernel void @minnum_f16_imm_a( ; ; GFX9-LABEL: minnum_f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_mov_b32 s10, s6 -; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s2 -; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 ; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX9-NEXT: v_min_f16_e32 v0, 0x4200, v0 -; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: minnum_f16_imm_a: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 s6, -1 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s10, s6 -; GFX10-NEXT: s_mov_b32 s11, s7 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s10, s2 +; GFX10-NEXT: s_mov_b32 s11, s3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s8, s2 -; GFX10-NEXT: s_mov_b32 s9, s3 -; GFX10-NEXT: s_mov_b32 s4, s0 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s0, s4 ; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_mov_b32 s1, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_min_f16_e32 v0, 0x4200, v0 -; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: minnum_f16_imm_a: @@ -328,40 +328,40 @@ define amdgpu_kernel void @minnum_f16_imm_b( ; ; GFX9-LABEL: minnum_f16_imm_b: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_mov_b32 s10, s6 -; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s2 -; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 ; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX9-NEXT: v_min_f16_e32 v0, 4.0, v0 -; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: minnum_f16_imm_b: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 s6, -1 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s10, s6 -; GFX10-NEXT: s_mov_b32 s11, s7 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s10, s2 +; GFX10-NEXT: s_mov_b32 s11, s3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s8, s2 -; GFX10-NEXT: s_mov_b32 s9, s3 -; GFX10-NEXT: s_mov_b32 s4, s0 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s0, s4 ; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_mov_b32 s1, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_min_f16_e32 v0, 4.0, v0 -; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: minnum_f16_imm_b: @@ -583,29 +583,29 @@ define amdgpu_kernel void @minnum_v2f16_imm_a( ; ; GFX9-LABEL: minnum_v2f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 -; GFX9-NEXT: s_mov_b32 s4, 0x44004200 -; GFX9-NEXT: v_pk_min_f16 v0, v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: v_pk_max_f16 v0, s0, s0 +; GFX9-NEXT: s_mov_b32 s0, 0x44004200 +; GFX9-NEXT: v_pk_min_f16 v0, v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: minnum_v2f16_imm_a: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, s2, s2 -; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_pk_max_f16 v0, s0, s0 ; GFX10-NEXT: v_pk_min_f16 v0, 0x44004200, v0 -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: minnum_v2f16_imm_a: @@ -672,29 +672,29 @@ define amdgpu_kernel void @minnum_v2f16_imm_b( ; ; GFX9-LABEL: minnum_v2f16_imm_b: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 -; GFX9-NEXT: s_mov_b32 s4, 0x42004400 -; GFX9-NEXT: v_pk_min_f16 v0, v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: v_pk_max_f16 v0, s0, s0 +; GFX9-NEXT: s_mov_b32 s0, 0x42004400 +; GFX9-NEXT: v_pk_min_f16 v0, v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: minnum_v2f16_imm_b: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, s2, s2 -; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_pk_max_f16 v0, s0, s0 ; GFX10-NEXT: v_pk_min_f16 v0, 0x42004400, v0 -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: minnum_v2f16_imm_b: @@ -1062,36 +1062,36 @@ define amdgpu_kernel void @fmin_v4f16_imm_a( ; ; GFX9-LABEL: fmin_v4f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s8, 0x44004200 ; GFX9-NEXT: s_mov_b32 s9, 0x40004800 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s3, s3 -; GFX9-NEXT: v_pk_max_f16 v2, s2, s2 +; GFX9-NEXT: v_pk_max_f16 v0, s7, s7 +; GFX9-NEXT: v_pk_max_f16 v2, s6, s6 ; GFX9-NEXT: v_pk_min_f16 v1, v0, s8 ; GFX9-NEXT: v_pk_min_f16 v0, v2, s9 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: fmin_v4f16_imm_a: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, s3, s3 -; GFX10-NEXT: v_pk_max_f16 v2, s2, s2 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_pk_max_f16 v0, s1, s1 +; GFX10-NEXT: v_pk_max_f16 v2, s0, s0 ; GFX10-NEXT: v_pk_min_f16 v1, 0x44004200, v0 ; GFX10-NEXT: v_pk_min_f16 v0, 0x40004800, v2 -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fmin_v4f16_imm_a: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll index 53ea253035655..2d3b18dcb121c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll @@ -365,57 +365,57 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) { ; ; GFX9-LABEL: umulo_i64_s: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s7, s0, s3 -; GFX9-NEXT: s_mul_hi_u32 s8, s0, s2 -; GFX9-NEXT: s_mul_hi_u32 s5, s0, s3 -; GFX9-NEXT: s_add_u32 s9, s8, s7 -; GFX9-NEXT: s_mul_i32 s6, s1, s2 -; GFX9-NEXT: s_addc_u32 s5, 0, s5 -; GFX9-NEXT: s_mul_hi_u32 s4, s1, s2 -; GFX9-NEXT: s_add_u32 s9, s9, s6 -; GFX9-NEXT: s_mul_hi_u32 s10, s1, s3 -; GFX9-NEXT: s_addc_u32 s4, s5, s4 -; GFX9-NEXT: s_addc_u32 s5, s10, 0 -; GFX9-NEXT: s_mul_i32 s1, s1, s3 -; GFX9-NEXT: s_add_u32 s4, s4, s1 -; GFX9-NEXT: s_addc_u32 s5, 0, s5 -; GFX9-NEXT: s_add_i32 s1, s8, s7 -; GFX9-NEXT: s_add_i32 s1, s1, s6 -; GFX9-NEXT: s_mul_i32 s0, s0, s2 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_cselect_b32 s1, 0, s1 -; GFX9-NEXT: s_cselect_b32 s0, 0, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_mul_i32 s3, s4, s7 +; GFX9-NEXT: s_mul_hi_u32 s8, s4, s6 +; GFX9-NEXT: s_mul_hi_u32 s1, s4, s7 +; GFX9-NEXT: s_add_u32 s9, s8, s3 +; GFX9-NEXT: s_mul_i32 s2, s5, s6 +; GFX9-NEXT: s_addc_u32 s1, 0, s1 +; GFX9-NEXT: s_mul_hi_u32 s0, s5, s6 +; GFX9-NEXT: s_add_u32 s9, s9, s2 +; GFX9-NEXT: s_mul_hi_u32 s10, s5, s7 +; GFX9-NEXT: s_addc_u32 s0, s1, s0 +; GFX9-NEXT: s_addc_u32 s1, s10, 0 +; GFX9-NEXT: s_mul_i32 s5, s5, s7 +; GFX9-NEXT: s_add_u32 s0, s0, s5 +; GFX9-NEXT: s_addc_u32 s1, 0, s1 +; GFX9-NEXT: s_add_i32 s3, s8, s3 +; GFX9-NEXT: s_add_i32 s3, s3, s2 +; GFX9-NEXT: s_mul_i32 s2, s4, s6 +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_cselect_b32 s0, 0, s3 +; GFX9-NEXT: s_cselect_b32 s1, 0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: umulo_i64_s: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mul_i32 s7, s0, s3 -; GFX10-NEXT: s_mul_hi_u32 s8, s0, s2 -; GFX10-NEXT: s_mul_hi_u32 s5, s0, s3 -; GFX10-NEXT: s_mul_hi_u32 s4, s1, s2 -; GFX10-NEXT: s_mul_i32 s6, s1, s2 -; GFX10-NEXT: s_mul_hi_u32 s9, s1, s3 -; GFX10-NEXT: s_mul_i32 s1, s1, s3 -; GFX10-NEXT: s_add_u32 s3, s8, s7 -; GFX10-NEXT: s_addc_u32 s5, 0, s5 -; GFX10-NEXT: s_add_u32 s3, s3, s6 -; GFX10-NEXT: s_addc_u32 s3, s5, s4 -; GFX10-NEXT: s_addc_u32 s5, s9, 0 -; GFX10-NEXT: s_add_u32 s4, s3, s1 -; GFX10-NEXT: s_addc_u32 s5, 0, s5 -; GFX10-NEXT: s_add_i32 s1, s8, s7 -; GFX10-NEXT: s_mul_i32 s0, s0, s2 -; GFX10-NEXT: s_add_i32 s1, s1, s6 -; GFX10-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX10-NEXT: s_cselect_b32 s0, 0, s0 -; GFX10-NEXT: s_cselect_b32 s1, 0, s1 +; GFX10-NEXT: s_mul_i32 s3, s4, s7 +; GFX10-NEXT: s_mul_hi_u32 s8, s4, s6 +; GFX10-NEXT: s_mul_hi_u32 s1, s4, s7 +; GFX10-NEXT: s_mul_hi_u32 s0, s5, s6 +; GFX10-NEXT: s_mul_i32 s2, s5, s6 +; GFX10-NEXT: s_mul_hi_u32 s9, s5, s7 +; GFX10-NEXT: s_mul_i32 s5, s5, s7 +; GFX10-NEXT: s_add_u32 s7, s8, s3 +; GFX10-NEXT: s_addc_u32 s1, 0, s1 +; GFX10-NEXT: s_add_u32 s7, s7, s2 +; GFX10-NEXT: s_addc_u32 s0, s1, s0 +; GFX10-NEXT: s_addc_u32 s1, s9, 0 +; GFX10-NEXT: s_add_u32 s0, s0, s5 +; GFX10-NEXT: s_addc_u32 s1, 0, s1 +; GFX10-NEXT: s_add_i32 s3, s8, s3 +; GFX10-NEXT: s_mul_i32 s4, s4, s6 +; GFX10-NEXT: s_add_i32 s3, s3, s2 +; GFX10-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX10-NEXT: s_cselect_b32 s0, 0, s4 +; GFX10-NEXT: s_cselect_b32 s1, 0, s3 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[0:1], off @@ -540,81 +540,81 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) { ; ; GFX9-LABEL: smulo_i64_s: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s7, s0, s3 -; GFX9-NEXT: s_mul_hi_u32 s8, s0, s2 -; GFX9-NEXT: s_mul_hi_u32 s5, s0, s3 -; GFX9-NEXT: s_add_u32 s9, s8, s7 -; GFX9-NEXT: s_mul_i32 s6, s1, s2 -; GFX9-NEXT: s_addc_u32 s5, 0, s5 -; GFX9-NEXT: s_mul_hi_u32 s4, s1, s2 -; GFX9-NEXT: s_add_u32 s9, s9, s6 -; GFX9-NEXT: s_mul_hi_i32 s10, s1, s3 -; GFX9-NEXT: s_addc_u32 s4, s5, s4 -; GFX9-NEXT: s_addc_u32 s5, s10, 0 -; GFX9-NEXT: s_mul_i32 s9, s1, s3 -; GFX9-NEXT: s_add_u32 s4, s4, s9 -; GFX9-NEXT: s_addc_u32 s5, 0, s5 -; GFX9-NEXT: s_sub_u32 s9, s4, s2 -; GFX9-NEXT: s_subb_u32 s10, s5, 0 -; GFX9-NEXT: s_cmp_lt_i32 s1, 0 -; GFX9-NEXT: s_cselect_b32 s4, s9, s4 -; GFX9-NEXT: s_cselect_b32 s1, s10, s5 -; GFX9-NEXT: s_sub_u32 s9, s4, s0 -; GFX9-NEXT: s_subb_u32 s5, s1, 0 -; GFX9-NEXT: s_cmp_lt_i32 s3, 0 -; GFX9-NEXT: s_cselect_b32 s5, s5, s1 -; GFX9-NEXT: s_cselect_b32 s4, s9, s4 -; GFX9-NEXT: s_add_i32 s1, s8, s7 -; GFX9-NEXT: s_add_i32 s1, s1, s6 -; GFX9-NEXT: s_ashr_i32 s6, s1, 31 -; GFX9-NEXT: s_mov_b32 s7, s6 -; GFX9-NEXT: s_mul_i32 s0, s0, s2 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], s[6:7] -; GFX9-NEXT: s_cselect_b32 s1, 0, s1 -; GFX9-NEXT: s_cselect_b32 s0, 0, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_mul_i32 s3, s4, s7 +; GFX9-NEXT: s_mul_hi_u32 s8, s4, s6 +; GFX9-NEXT: s_mul_hi_u32 s1, s4, s7 +; GFX9-NEXT: s_add_u32 s9, s8, s3 +; GFX9-NEXT: s_mul_i32 s2, s5, s6 +; GFX9-NEXT: s_addc_u32 s1, 0, s1 +; GFX9-NEXT: s_mul_hi_u32 s0, s5, s6 +; GFX9-NEXT: s_add_u32 s9, s9, s2 +; GFX9-NEXT: s_mul_hi_i32 s10, s5, s7 +; GFX9-NEXT: s_addc_u32 s0, s1, s0 +; GFX9-NEXT: s_addc_u32 s1, s10, 0 +; GFX9-NEXT: s_mul_i32 s9, s5, s7 +; GFX9-NEXT: s_add_u32 s0, s0, s9 +; GFX9-NEXT: s_addc_u32 s1, 0, s1 +; GFX9-NEXT: s_sub_u32 s9, s0, s6 +; GFX9-NEXT: s_subb_u32 s10, s1, 0 +; GFX9-NEXT: s_cmp_lt_i32 s5, 0 +; GFX9-NEXT: s_cselect_b32 s0, s9, s0 +; GFX9-NEXT: s_cselect_b32 s1, s10, s1 +; GFX9-NEXT: s_sub_u32 s5, s0, s4 +; GFX9-NEXT: s_subb_u32 s9, s1, 0 +; GFX9-NEXT: s_cmp_lt_i32 s7, 0 +; GFX9-NEXT: s_cselect_b32 s1, s9, s1 +; GFX9-NEXT: s_cselect_b32 s0, s5, s0 +; GFX9-NEXT: s_add_i32 s3, s8, s3 +; GFX9-NEXT: s_add_i32 s5, s3, s2 +; GFX9-NEXT: s_ashr_i32 s2, s5, 31 +; GFX9-NEXT: s_mov_b32 s3, s2 +; GFX9-NEXT: s_mul_i32 s4, s4, s6 +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], s[2:3] +; GFX9-NEXT: s_cselect_b32 s0, 0, s5 +; GFX9-NEXT: s_cselect_b32 s1, 0, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: smulo_i64_s: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mul_i32 s7, s0, s3 -; GFX10-NEXT: s_mul_hi_u32 s8, s0, s2 -; GFX10-NEXT: s_mul_hi_u32 s5, s0, s3 -; GFX10-NEXT: s_mul_i32 s6, s1, s2 -; GFX10-NEXT: s_add_u32 s11, s8, s7 -; GFX10-NEXT: s_mul_hi_u32 s4, s1, s2 -; GFX10-NEXT: s_addc_u32 s5, 0, s5 -; GFX10-NEXT: s_mul_hi_i32 s9, s1, s3 -; GFX10-NEXT: s_add_u32 s11, s11, s6 -; GFX10-NEXT: s_mul_i32 s10, s1, s3 -; GFX10-NEXT: s_addc_u32 s4, s5, s4 -; GFX10-NEXT: s_addc_u32 s5, s9, 0 -; GFX10-NEXT: s_add_u32 s4, s4, s10 -; GFX10-NEXT: s_addc_u32 s5, 0, s5 -; GFX10-NEXT: s_sub_u32 s9, s4, s2 -; GFX10-NEXT: s_subb_u32 s10, s5, 0 -; GFX10-NEXT: s_cmp_lt_i32 s1, 0 -; GFX10-NEXT: s_cselect_b32 s1, s9, s4 -; GFX10-NEXT: s_cselect_b32 s4, s10, s5 -; GFX10-NEXT: s_sub_u32 s9, s1, s0 -; GFX10-NEXT: s_subb_u32 s5, s4, 0 -; GFX10-NEXT: s_cmp_lt_i32 s3, 0 -; GFX10-NEXT: s_mul_i32 s0, s0, s2 -; GFX10-NEXT: s_cselect_b32 s5, s5, s4 -; GFX10-NEXT: s_cselect_b32 s4, s9, s1 -; GFX10-NEXT: s_add_i32 s1, s8, s7 -; GFX10-NEXT: s_add_i32 s1, s1, s6 -; GFX10-NEXT: s_ashr_i32 s6, s1, 31 -; GFX10-NEXT: s_mov_b32 s7, s6 -; GFX10-NEXT: s_cmp_lg_u64 s[4:5], s[6:7] -; GFX10-NEXT: s_cselect_b32 s0, 0, s0 -; GFX10-NEXT: s_cselect_b32 s1, 0, s1 +; GFX10-NEXT: s_mul_i32 s3, s4, s7 +; GFX10-NEXT: s_mul_hi_u32 s8, s4, s6 +; GFX10-NEXT: s_mul_hi_u32 s1, s4, s7 +; GFX10-NEXT: s_mul_i32 s2, s5, s6 +; GFX10-NEXT: s_add_u32 s11, s8, s3 +; GFX10-NEXT: s_mul_hi_u32 s0, s5, s6 +; GFX10-NEXT: s_addc_u32 s1, 0, s1 +; GFX10-NEXT: s_mul_hi_i32 s9, s5, s7 +; GFX10-NEXT: s_add_u32 s11, s11, s2 +; GFX10-NEXT: s_mul_i32 s10, s5, s7 +; GFX10-NEXT: s_addc_u32 s0, s1, s0 +; GFX10-NEXT: s_addc_u32 s1, s9, 0 +; GFX10-NEXT: s_add_u32 s0, s0, s10 +; GFX10-NEXT: s_addc_u32 s1, 0, s1 +; GFX10-NEXT: s_sub_u32 s9, s0, s6 +; GFX10-NEXT: s_subb_u32 s10, s1, 0 +; GFX10-NEXT: s_cmp_lt_i32 s5, 0 +; GFX10-NEXT: s_cselect_b32 s0, s9, s0 +; GFX10-NEXT: s_cselect_b32 s1, s10, s1 +; GFX10-NEXT: s_sub_u32 s5, s0, s4 +; GFX10-NEXT: s_subb_u32 s9, s1, 0 +; GFX10-NEXT: s_cmp_lt_i32 s7, 0 +; GFX10-NEXT: s_mul_i32 s4, s4, s6 +; GFX10-NEXT: s_cselect_b32 s1, s9, s1 +; GFX10-NEXT: s_cselect_b32 s0, s5, s0 +; GFX10-NEXT: s_add_i32 s3, s8, s3 +; GFX10-NEXT: s_add_i32 s5, s3, s2 +; GFX10-NEXT: s_ashr_i32 s2, s5, 31 +; GFX10-NEXT: s_mov_b32 s3, s2 +; GFX10-NEXT: s_cmp_lg_u64 s[0:1], s[2:3] +; GFX10-NEXT: s_cselect_b32 s0, 0, s4 +; GFX10-NEXT: s_cselect_b32 s1, 0, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[0:1], off diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll index 47dd0263d020e..bf7dbcde62fdd 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll @@ -28,23 +28,41 @@ define amdgpu_kernel void @rint_f16( ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; GFX89-LABEL: rint_f16: -; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: s_mov_b32 s10, s6 -; GFX89-NEXT: s_mov_b32 s11, s7 -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s8, s2 -; GFX89-NEXT: s_mov_b32 s9, s3 -; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GFX89-NEXT: s_mov_b32 s4, s0 -; GFX89-NEXT: s_mov_b32 s5, s1 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: v_rndne_f16_e32 v0, v0 -; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 -; GFX89-NEXT: s_endpgm +; VI-LABEL: rint_f16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_rndne_f16_e32 v0, v0 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: rint_f16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_rndne_f16_e32 v0, v0 +; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: rint_f16: ; GFX11: ; %bb.0: ; %entry @@ -131,22 +149,22 @@ define amdgpu_kernel void @rint_v2f16( ; ; GFX9-LABEL: rint_v2f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_mov_b32 s10, s6 -; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s2 -; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 ; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rndne_f16_e32 v1, v0 ; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: rint_v2f16: @@ -180,3 +198,5 @@ entry: store <2 x half> %r.val, ptr addrspace(1) %r ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX89: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.ll index 9327d76e50692..e7b17c30cf753 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.round.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck --check-prefixes=GFX6 %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX89,GFX8 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX89,GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX8 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX11 %s ; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck --check-prefixes=R600 %s @@ -24,23 +24,41 @@ define amdgpu_kernel void @round_f32(ptr addrspace(1) %out, float %x) #0 { ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX89-LABEL: round_f32: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_load_dword s6, s[2:3], 0x2c -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX89-NEXT: s_mov_b32 s3, 0xf000 -; GFX89-NEXT: s_mov_b32 s2, -1 -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: v_trunc_f32_e32 v0, s6 -; GFX89-NEXT: v_sub_f32_e32 v1, s6, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5] -; GFX89-NEXT: s_brev_b32 s4, -2 -; GFX89-NEXT: v_mov_b32_e32 v2, s6 -; GFX89-NEXT: v_bfi_b32 v1, s4, v1, v2 -; GFX89-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX89-NEXT: s_endpgm +; GFX8-LABEL: round_f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_trunc_f32_e32 v0, s6 +; GFX8-NEXT: v_sub_f32_e32 v1, s6, v0 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5] +; GFX8-NEXT: s_brev_b32 s4, -2 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: round_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f32_e32 v0, s6 +; GFX9-NEXT: v_sub_f32_e32 v1, s6, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5] +; GFX9-NEXT: s_brev_b32 s4, -2 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: round_f32: ; GFX11: ; %bb.0: @@ -261,44 +279,83 @@ define amdgpu_kernel void @round_v4f32(ptr addrspace(1) %out, <4 x float> %in) # ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX89-LABEL: round_v4f32: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX89-NEXT: s_brev_b32 s10, -2 -; GFX89-NEXT: s_mov_b32 s3, 0xf000 -; GFX89-NEXT: s_mov_b32 s2, -1 -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: v_trunc_f32_e32 v0, s7 -; GFX89-NEXT: v_sub_f32_e32 v1, s7, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9] -; GFX89-NEXT: v_mov_b32_e32 v2, s7 -; GFX89-NEXT: v_bfi_b32 v1, s10, v1, v2 -; GFX89-NEXT: v_add_f32_e32 v3, v0, v1 -; GFX89-NEXT: v_trunc_f32_e32 v0, s6 -; GFX89-NEXT: v_sub_f32_e32 v1, s6, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9] -; GFX89-NEXT: v_mov_b32_e32 v2, s6 -; GFX89-NEXT: v_bfi_b32 v1, s10, v1, v2 -; GFX89-NEXT: v_add_f32_e32 v2, v0, v1 -; GFX89-NEXT: v_trunc_f32_e32 v0, s5 -; GFX89-NEXT: v_sub_f32_e32 v1, s5, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[6:7] -; GFX89-NEXT: v_mov_b32_e32 v4, s5 -; GFX89-NEXT: v_bfi_b32 v1, s10, v1, v4 -; GFX89-NEXT: v_add_f32_e32 v1, v0, v1 -; GFX89-NEXT: v_trunc_f32_e32 v0, s4 -; GFX89-NEXT: v_sub_f32_e32 v4, s4, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[6:7] -; GFX89-NEXT: v_mov_b32_e32 v5, s4 -; GFX89-NEXT: v_bfi_b32 v4, s10, v4, v5 -; GFX89-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; GFX89-NEXT: s_endpgm +; GFX8-LABEL: round_v4f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_brev_b32 s10, -2 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_trunc_f32_e32 v0, s7 +; GFX8-NEXT: v_sub_f32_e32 v1, s7, v0 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9] +; GFX8-NEXT: v_mov_b32_e32 v2, s7 +; GFX8-NEXT: v_bfi_b32 v1, s10, v1, v2 +; GFX8-NEXT: v_add_f32_e32 v3, v0, v1 +; GFX8-NEXT: v_trunc_f32_e32 v0, s6 +; GFX8-NEXT: v_sub_f32_e32 v1, s6, v0 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9] +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_bfi_b32 v1, s10, v1, v2 +; GFX8-NEXT: v_add_f32_e32 v2, v0, v1 +; GFX8-NEXT: v_trunc_f32_e32 v0, s5 +; GFX8-NEXT: v_sub_f32_e32 v1, s5, v0 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_bfi_b32 v1, s10, v1, v4 +; GFX8-NEXT: v_add_f32_e32 v1, v0, v1 +; GFX8-NEXT: v_trunc_f32_e32 v0, s4 +; GFX8-NEXT: v_sub_f32_e32 v4, s4, v0 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: v_bfi_b32 v4, s10, v4, v5 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: round_v4f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_brev_b32 s10, -2 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f32_e32 v0, s7 +; GFX9-NEXT: v_sub_f32_e32 v1, s7, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9] +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_bfi_b32 v1, s10, v1, v2 +; GFX9-NEXT: v_add_f32_e32 v3, v0, v1 +; GFX9-NEXT: v_trunc_f32_e32 v0, s6 +; GFX9-NEXT: v_sub_f32_e32 v1, s6, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9] +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_bfi_b32 v1, s10, v1, v2 +; GFX9-NEXT: v_add_f32_e32 v2, v0, v1 +; GFX9-NEXT: v_trunc_f32_e32 v0, s5 +; GFX9-NEXT: v_sub_f32_e32 v1, s5, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-NEXT: v_bfi_b32 v1, s10, v1, v4 +; GFX9-NEXT: v_add_f32_e32 v1, v0, v1 +; GFX9-NEXT: v_trunc_f32_e32 v0, s4 +; GFX9-NEXT: v_sub_f32_e32 v4, s4, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-NEXT: v_bfi_b32 v4, s10, v4, v5 +; GFX9-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: round_v4f32: ; GFX11: ; %bb.0: @@ -447,73 +504,141 @@ define amdgpu_kernel void @round_v8f32(ptr addrspace(1) %out, <8 x float> %in) # ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; GFX6-NEXT: s_endpgm ; -; GFX89-LABEL: round_v8f32: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; GFX89-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x24 -; GFX89-NEXT: s_brev_b32 s2, -2 -; GFX89-NEXT: s_mov_b32 s15, 0xf000 -; GFX89-NEXT: s_mov_b32 s14, -1 -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: v_trunc_f32_e32 v0, s7 -; GFX89-NEXT: v_sub_f32_e32 v1, s7, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] -; GFX89-NEXT: v_mov_b32_e32 v2, s7 -; GFX89-NEXT: v_bfi_b32 v1, s2, v1, v2 -; GFX89-NEXT: v_add_f32_e32 v3, v0, v1 -; GFX89-NEXT: v_trunc_f32_e32 v0, s6 -; GFX89-NEXT: v_sub_f32_e32 v1, s6, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] -; GFX89-NEXT: v_mov_b32_e32 v2, s6 -; GFX89-NEXT: v_bfi_b32 v1, s2, v1, v2 -; GFX89-NEXT: v_add_f32_e32 v2, v0, v1 -; GFX89-NEXT: v_trunc_f32_e32 v0, s5 -; GFX89-NEXT: v_sub_f32_e32 v1, s5, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] -; GFX89-NEXT: v_mov_b32_e32 v4, s5 -; GFX89-NEXT: v_bfi_b32 v1, s2, v1, v4 -; GFX89-NEXT: v_add_f32_e32 v1, v0, v1 -; GFX89-NEXT: v_trunc_f32_e32 v0, s4 -; GFX89-NEXT: v_sub_f32_e32 v4, s4, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[0:1] -; GFX89-NEXT: v_mov_b32_e32 v5, s4 -; GFX89-NEXT: v_bfi_b32 v4, s2, v4, v5 -; GFX89-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX89-NEXT: v_trunc_f32_e32 v4, s11 -; GFX89-NEXT: v_sub_f32_e32 v5, s11, v4 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] -; GFX89-NEXT: v_mov_b32_e32 v6, s11 -; GFX89-NEXT: v_bfi_b32 v5, s2, v5, v6 -; GFX89-NEXT: v_add_f32_e32 v7, v4, v5 -; GFX89-NEXT: v_trunc_f32_e32 v4, s10 -; GFX89-NEXT: v_sub_f32_e32 v5, s10, v4 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] -; GFX89-NEXT: v_mov_b32_e32 v6, s10 -; GFX89-NEXT: v_bfi_b32 v5, s2, v5, v6 -; GFX89-NEXT: v_add_f32_e32 v6, v4, v5 -; GFX89-NEXT: v_trunc_f32_e32 v4, s9 -; GFX89-NEXT: v_sub_f32_e32 v5, s9, v4 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] -; GFX89-NEXT: v_mov_b32_e32 v8, s9 -; GFX89-NEXT: v_bfi_b32 v5, s2, v5, v8 -; GFX89-NEXT: v_add_f32_e32 v5, v4, v5 -; GFX89-NEXT: v_trunc_f32_e32 v4, s8 -; GFX89-NEXT: v_sub_f32_e32 v8, s8, v4 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v8|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[0:1] -; GFX89-NEXT: v_mov_b32_e32 v9, s8 -; GFX89-NEXT: v_bfi_b32 v8, s2, v8, v9 -; GFX89-NEXT: v_add_f32_e32 v4, v4, v8 -; GFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16 -; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 -; GFX89-NEXT: s_endpgm +; GFX8-LABEL: round_v8f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 +; GFX8-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x24 +; GFX8-NEXT: s_brev_b32 s2, -2 +; GFX8-NEXT: s_mov_b32 s15, 0xf000 +; GFX8-NEXT: s_mov_b32 s14, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_trunc_f32_e32 v0, s7 +; GFX8-NEXT: v_sub_f32_e32 v1, s7, v0 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, s7 +; GFX8-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX8-NEXT: v_add_f32_e32 v3, v0, v1 +; GFX8-NEXT: v_trunc_f32_e32 v0, s6 +; GFX8-NEXT: v_sub_f32_e32 v1, s6, v0 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX8-NEXT: v_add_f32_e32 v2, v0, v1 +; GFX8-NEXT: v_trunc_f32_e32 v0, s5 +; GFX8-NEXT: v_sub_f32_e32 v1, s5, v0 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_bfi_b32 v1, s2, v1, v4 +; GFX8-NEXT: v_add_f32_e32 v1, v0, v1 +; GFX8-NEXT: v_trunc_f32_e32 v0, s4 +; GFX8-NEXT: v_sub_f32_e32 v4, s4, v0 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: v_bfi_b32 v4, s2, v4, v5 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX8-NEXT: v_trunc_f32_e32 v4, s11 +; GFX8-NEXT: v_sub_f32_e32 v5, s11, v4 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v6, s11 +; GFX8-NEXT: v_bfi_b32 v5, s2, v5, v6 +; GFX8-NEXT: v_add_f32_e32 v7, v4, v5 +; GFX8-NEXT: v_trunc_f32_e32 v4, s10 +; GFX8-NEXT: v_sub_f32_e32 v5, s10, v4 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v6, s10 +; GFX8-NEXT: v_bfi_b32 v5, s2, v5, v6 +; GFX8-NEXT: v_add_f32_e32 v6, v4, v5 +; GFX8-NEXT: v_trunc_f32_e32 v4, s9 +; GFX8-NEXT: v_sub_f32_e32 v5, s9, v4 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v8, s9 +; GFX8-NEXT: v_bfi_b32 v5, s2, v5, v8 +; GFX8-NEXT: v_add_f32_e32 v5, v4, v5 +; GFX8-NEXT: v_trunc_f32_e32 v4, s8 +; GFX8-NEXT: v_sub_f32_e32 v8, s8, v4 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], |v8|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v9, s8 +; GFX8-NEXT: v_bfi_b32 v8, s2, v8, v9 +; GFX8-NEXT: v_add_f32_e32 v4, v4, v8 +; GFX8-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16 +; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: round_v8f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 +; GFX9-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x24 +; GFX9-NEXT: s_brev_b32 s2, -2 +; GFX9-NEXT: s_mov_b32 s15, 0xf000 +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f32_e32 v0, s7 +; GFX9-NEXT: v_sub_f32_e32 v1, s7, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX9-NEXT: v_add_f32_e32 v3, v0, v1 +; GFX9-NEXT: v_trunc_f32_e32 v0, s6 +; GFX9-NEXT: v_sub_f32_e32 v1, s6, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX9-NEXT: v_add_f32_e32 v2, v0, v1 +; GFX9-NEXT: v_trunc_f32_e32 v0, s5 +; GFX9-NEXT: v_sub_f32_e32 v1, s5, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v4 +; GFX9-NEXT: v_add_f32_e32 v1, v0, v1 +; GFX9-NEXT: v_trunc_f32_e32 v0, s4 +; GFX9-NEXT: v_sub_f32_e32 v4, s4, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-NEXT: v_bfi_b32 v4, s2, v4, v5 +; GFX9-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX9-NEXT: v_trunc_f32_e32 v4, s11 +; GFX9-NEXT: v_sub_f32_e32 v5, s11, v4 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v6, s11 +; GFX9-NEXT: v_bfi_b32 v5, s2, v5, v6 +; GFX9-NEXT: v_add_f32_e32 v7, v4, v5 +; GFX9-NEXT: v_trunc_f32_e32 v4, s10 +; GFX9-NEXT: v_sub_f32_e32 v5, s10, v4 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-NEXT: v_bfi_b32 v5, s2, v5, v6 +; GFX9-NEXT: v_add_f32_e32 v6, v4, v5 +; GFX9-NEXT: v_trunc_f32_e32 v4, s9 +; GFX9-NEXT: v_sub_f32_e32 v5, s9, v4 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v8, s9 +; GFX9-NEXT: v_bfi_b32 v5, s2, v5, v8 +; GFX9-NEXT: v_add_f32_e32 v5, v4, v5 +; GFX9-NEXT: v_trunc_f32_e32 v4, s8 +; GFX9-NEXT: v_sub_f32_e32 v8, s8, v4 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v8|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v9, s8 +; GFX9-NEXT: v_bfi_b32 v8, s2, v8, v9 +; GFX9-NEXT: v_add_f32_e32 v4, v4, v8 +; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16 +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: round_v8f32: ; GFX11: ; %bb.0: @@ -672,24 +797,43 @@ define amdgpu_kernel void @round_f16(ptr addrspace(1) %out, i32 %x.arg) #0 { ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX89-LABEL: round_f16: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX89-NEXT: v_mov_b32_e32 v0, 0x3c00 -; GFX89-NEXT: s_movk_i32 s5, 0x7fff -; GFX89-NEXT: s_mov_b32 s3, 0xf000 -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: v_trunc_f16_e32 v1, s4 -; GFX89-NEXT: v_sub_f16_e32 v2, s4, v1 -; GFX89-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX89-NEXT: v_mov_b32_e32 v2, s4 -; GFX89-NEXT: v_bfi_b32 v0, s5, v0, v2 -; GFX89-NEXT: s_mov_b32 s2, -1 -; GFX89-NEXT: v_add_f16_e32 v0, v1, v0 -; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0 -; GFX89-NEXT: s_endpgm +; GFX8-LABEL: round_f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: v_mov_b32_e32 v0, 0x3c00 +; GFX8-NEXT: s_movk_i32 s5, 0x7fff +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_trunc_f16_e32 v1, s4 +; GFX8-NEXT: v_sub_f16_e32 v2, s4, v1 +; GFX8-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_bfi_b32 v0, s5, v0, v2 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: v_add_f16_e32 v0, v1, v0 +; GFX8-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: round_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 +; GFX9-NEXT: s_movk_i32 s5, 0x7fff +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f16_e32 v1, s4 +; GFX9-NEXT: v_sub_f16_e32 v2, s4, v1 +; GFX9-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_bfi_b32 v0, s5, v0, v2 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: v_add_f16_e32 v0, v1, v0 +; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: round_f16: ; GFX11: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll index a70f4d8d90065..3ae0cf65eb00f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll @@ -46,26 +46,26 @@ define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX9-LABEL: sin_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_f16_e32 v1, 0.15915494, v1 ; GFX9-NEXT: v_sin_f16_e32 v1, v1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sin_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_f16_e32 v1, 0.15915494, v1 ; GFX10-NEXT: v_sin_f16_e32 v1, v1 -; GFX10-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: sin_f16: @@ -142,34 +142,34 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX9-LABEL: sin_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x3118 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_f16_e32 v3, 0.15915494, v1 ; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_sin_f16_e32 v2, v3 ; GFX9-NEXT: v_sin_f16_e32 v1, v1 ; GFX9-NEXT: v_pack_b32_f16 v1, v2, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sin_v2f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0x3118 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_f16_e32 v3, 0.15915494, v1 ; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_sin_f16_e32 v2, v3 ; GFX10-NEXT: v_sin_f16_e32 v1, v1 ; GFX10-NEXT: v_pack_b32_f16 v1, v2, v1 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: sin_v2f16: diff --git a/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll b/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll index e9a1b38eee157..40cfa4e7e4dfc 100644 --- a/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll +++ b/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll @@ -93,7 +93,7 @@ define amdgpu_kernel void @add_u64_vs(i64 %a) { define amdgpu_kernel void @add_u64_ss(i64 %v, i64 %a) { ; GCN-LABEL: add_u64_ss: ; GCN: s_add_u32 -; GCN: s_addc_u32 s1, s1, s3 +; GCN: s_addc_u32 s1, s5, s7 %add = add i64 %v, %a store i64 %add, ptr undef ret void diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll index 48df9a0d98d6b..a4bde5c9d8215 100644 --- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -79,13 +79,13 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, < define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_lshr_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_lshrrev_b16 v0, v1, v0 -; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_lshr_v2i16: @@ -131,13 +131,13 @@ define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-LABEL: v_lshr_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_lshrrev_b16 v0, v1, v0 -; GFX10-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-NEXT: global_store_dword v2, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_lshr_v2i16: @@ -363,13 +363,13 @@ define amdgpu_kernel void @lshr_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: lshr_imm_v_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, 8 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: lshr_imm_v_v2i16: @@ -414,13 +414,13 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX10-LABEL: lshr_imm_v_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_lshrrev_b16 v1, v1, 8 op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: lshr_imm_v_v2i16: @@ -450,13 +450,13 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: lshr_v_imm_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: lshr_v_imm_v2i16: @@ -497,13 +497,13 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX10-LABEL: lshr_v_imm_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: lshr_v_imm_v2i16: @@ -533,14 +533,14 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_lshr_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] +; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_lshrrev_b16 v1, v3, v1 ; GFX9-NEXT: v_pk_lshrrev_b16 v0, v2, v0 -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_lshr_v4i16: @@ -596,14 +596,14 @@ define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-LABEL: v_lshr_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] +; GFX10-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_lshrrev_b16 v1, v3, v1 ; GFX10-NEXT: v_pk_lshrrev_b16 v0, v2, v0 -; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_lshr_v4i16: @@ -636,14 +636,14 @@ define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: lshr_v_imm_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: lshr_v_imm_v4i16: @@ -689,14 +689,14 @@ define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX10-LABEL: lshr_v_imm_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: lshr_v_imm_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/mad.u16.ll b/llvm/test/CodeGen/AMDGPU/mad.u16.ll index 3032b1028dc2d..a3e7bf2caf772 100644 --- a/llvm/test/CodeGen/AMDGPU/mad.u16.ll +++ b/llvm/test/CodeGen/AMDGPU/mad.u16.ll @@ -35,34 +35,34 @@ define amdgpu_kernel void @mad_u16( ; ; GFX9-LABEL: mad_u16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_ushort v2, v0, s[4:5] glc +; GFX9-NEXT: global_load_ushort v2, v0, s[8:9] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_ushort v3, v0, s[6:7] glc +; GFX9-NEXT: global_load_ushort v3, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: mad_u16: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_ushort v2, v0, s[4:5] glc dlc +; GFX10-NEXT: global_load_ushort v2, v0, s[8:9] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_ushort v3, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_ushort v3, v0, s[10:11] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mad_u16 v1, v1, v2, v3 -; GFX10-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: mad_u16: diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll index a77892c8f5fc7..686797f290b97 100644 --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -2513,38 +2513,38 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ; ; GFX9-LABEL: v_test_umin_ult_i32_multi_use: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s8, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s9, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[12:13], 0x0 +; GFX9-NEXT: s_load_dword s3, s[14:15], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_lt_u32 s8, s9 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX9-NEXT: s_cselect_b32 s4, s8, s9 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: global_store_dword v0, v2, s[0:1] -; GFX9-NEXT: global_store_byte v0, v1, s[2:3] +; GFX9-NEXT: s_cmp_lt_u32 s2, s3 +; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: global_store_dword v0, v2, s[8:9] +; GFX9-NEXT: global_store_byte v0, v1, s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_umin_ult_i32_multi_use: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s8, s[4:5], 0x0 -; GFX10-NEXT: s_load_dword s9, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s0, s[12:13], 0x0 +; GFX10-NEXT: s_load_dword s1, s[14:15], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_lt_u32 s8, s9 -; GFX10-NEXT: s_cselect_b32 s4, -1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 -; GFX10-NEXT: s_and_b32 s4, s4, exec_lo -; GFX10-NEXT: s_cselect_b32 s4, s8, s9 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: global_store_dword v1, v2, s[0:1] -; GFX10-NEXT: global_store_byte v1, v0, s[2:3] +; GFX10-NEXT: s_cmp_lt_u32 s0, s1 +; GFX10-NEXT: s_cselect_b32 s2, -1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX10-NEXT: s_and_b32 s2, s2, exec_lo +; GFX10-NEXT: s_cselect_b32 s0, s0, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: global_store_dword v1, v2, s[8:9] +; GFX10-NEXT: global_store_byte v1, v0, s[10:11] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_umin_ult_i32_multi_use: @@ -2665,33 +2665,33 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; ; GFX9-LABEL: v_test_umin_ult_i16_multi_use: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] -; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] +; GFX9-NEXT: global_load_ushort v1, v0, s[12:13] +; GFX9-NEXT: global_load_ushort v2, v0, s[14:15] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[8:9] ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX9-NEXT: global_store_byte v0, v1, s[2:3] +; GFX9-NEXT: global_store_byte v0, v1, s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_umin_ult_i16_multi_use: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_ushort v1, v0, s[4:5] -; GFX10-NEXT: global_load_ushort v2, v0, s[6:7] +; GFX10-NEXT: global_load_ushort v1, v0, s[12:13] +; GFX10-NEXT: global_load_ushort v2, v0, s[14:15] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_lt_u32_e32 vcc_lo, v1, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX10-NEXT: global_store_short v0, v1, s[0:1] -; GFX10-NEXT: global_store_byte v0, v2, s[2:3] +; GFX10-NEXT: global_store_short v0, v1, s[8:9] +; GFX10-NEXT: global_store_byte v0, v2, s[10:11] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_umin_ult_i16_multi_use: diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll index 0889f8ef6316e..9b44b58c4a01e 100644 --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -50,40 +50,40 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: test_mul_v2i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_mov_b32 s10, s6 -; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s2 -; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 ; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v1, v1, v3 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, v2 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: test_mul_v2i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 s6, -1 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s10, s6 -; GFX10-NEXT: s_mov_b32 s11, s7 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s10, s2 +; GFX10-NEXT: s_mov_b32 s11, s3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s8, s2 -; GFX10-NEXT: s_mov_b32 s9, s3 -; GFX10-NEXT: s_mov_b32 s4, s0 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s0, s4 ; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_mov_b32 s1, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_lo_u32 v1, v1, v3 ; GFX10-NEXT: v_mul_lo_u32 v0, v0, v2 -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_mul_v2i32: @@ -201,47 +201,47 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_mul_v4i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_mov_b32 s10, s6 -; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s2 -; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 ; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v3, v3, v7 ; GFX9-NEXT: v_mul_lo_u32 v2, v2, v6 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, v5 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, v4 -; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_mul_v4i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 s6, -1 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s10, s6 -; GFX10-NEXT: s_mov_b32 s11, s7 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s10, s2 +; GFX10-NEXT: s_mov_b32 s11, s3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s8, s2 -; GFX10-NEXT: s_mov_b32 s9, s3 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_mov_b32 s9, s7 ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; GFX10-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; GFX10-NEXT: s_mov_b32 s4, s0 -; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_mov_b32 s0, s4 +; GFX10-NEXT: s_mov_b32 s1, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_lo_u32 v3, v3, v7 ; GFX10-NEXT: v_mul_lo_u32 v2, v2, v6 ; GFX10-NEXT: v_mul_lo_u32 v1, v1, v5 ; GFX10-NEXT: v_mul_lo_u32 v0, v0, v4 -; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_v4i32: @@ -857,41 +857,41 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: v_mul64_sext_c: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_mov_b32 s10, s6 -; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s2 -; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 ; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX9-NEXT: s_movk_i32 s2, 0x50 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_movk_i32 s0, 0x50 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mul_hi_i32 v1, v0, s2 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: v_mul_hi_i32 v1, v0, s0 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_mul64_sext_c: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 s6, -1 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s10, s6 -; GFX10-NEXT: s_mov_b32 s11, s7 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s10, s2 +; GFX10-NEXT: s_mov_b32 s11, s3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s8, s2 -; GFX10-NEXT: s_mov_b32 s9, s3 -; GFX10-NEXT: s_mov_b32 s4, s0 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s0, s4 ; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_mov_b32 s1, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_hi_i32 v1, 0x50, v0 ; GFX10-NEXT: v_mul_lo_u32 v0, 0x50, v0 -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul64_sext_c: @@ -1004,41 +1004,41 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: v_mul64_zext_c: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_mov_b32 s10, s6 -; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s2 -; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 ; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX9-NEXT: s_movk_i32 s2, 0x50 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_movk_i32 s0, 0x50 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mul_hi_u32 v1, v0, s2 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: v_mul_hi_u32 v1, v0, s0 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_mul64_zext_c: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 s6, -1 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s10, s6 -; GFX10-NEXT: s_mov_b32 s11, s7 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s10, s2 +; GFX10-NEXT: s_mov_b32 s11, s3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s8, s2 -; GFX10-NEXT: s_mov_b32 s9, s3 -; GFX10-NEXT: s_mov_b32 s4, s0 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s0, s4 ; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_mov_b32 s1, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_hi_u32 v1, 0x50, v0 ; GFX10-NEXT: v_mul_lo_u32 v0, 0x50, v0 -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul64_zext_c: @@ -1149,40 +1149,40 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: v_mul64_sext_inline_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_mov_b32 s10, s6 -; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s2 -; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 ; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_hi_i32 v1, v0, 9 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, 9 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_mul64_sext_inline_imm: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 s6, -1 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s10, s6 -; GFX10-NEXT: s_mov_b32 s11, s7 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s10, s2 +; GFX10-NEXT: s_mov_b32 s11, s3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s8, s2 -; GFX10-NEXT: s_mov_b32 s9, s3 -; GFX10-NEXT: s_mov_b32 s4, s0 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s0, s4 ; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_mov_b32 s1, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_hi_i32 v1, v0, 9 ; GFX10-NEXT: v_mul_lo_u32 v0, v0, 9 -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul64_sext_inline_imm: @@ -1396,38 +1396,38 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX9-LABEL: v_mul_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_mov_b32 s10, s6 -; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s2 -; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 ; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, v0, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_mul_i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 s6, -1 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s10, s6 -; GFX10-NEXT: s_mov_b32 s11, s7 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s10, s2 +; GFX10-NEXT: s_mov_b32 s11, s3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s8, s2 -; GFX10-NEXT: s_mov_b32 s9, s3 -; GFX10-NEXT: s_mov_b32 s4, s0 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s0, s4 ; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_mov_b32 s1, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_lo_u32 v0, v0, v1 -; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i32: @@ -1662,43 +1662,43 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GFX9-LABEL: v_mul_i1: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_mov_b32 s10, s6 -; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s2 -; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 ; GFX9-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 ; GFX9-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_mul_i1: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 s6, -1 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s10, s6 -; GFX10-NEXT: s_mov_b32 s11, s7 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s10, s2 +; GFX10-NEXT: s_mov_b32 s11, s3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s8, s2 -; GFX10-NEXT: s_mov_b32 s9, s3 +; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: s_mov_b32 s9, s7 ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 ; GFX10-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4 -; GFX10-NEXT: s_mov_b32 s4, s0 -; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_mov_b32 s0, s4 +; GFX10-NEXT: s_mov_b32 s1, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i1: @@ -2211,67 +2211,67 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: s_cmp_lg_u32 s0, 0 ; GFX9-NEXT: s_cbranch_scc0 .LBB15_2 ; GFX9-NEXT: ; %bb.1: ; %else -; GFX9-NEXT: s_mul_i32 s6, s0, s1 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_mul_i32 s8, s0, s1 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: s_branch .LBB15_3 ; GFX9-NEXT: .LBB15_2: -; GFX9-NEXT: s_mov_b64 s[4:5], -1 -; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: s_mov_b64 s[0:1], -1 +; GFX9-NEXT: ; implicit-def: $sgpr8 ; GFX9-NEXT: .LBB15_3: ; %Flow -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GFX9-NEXT: s_cbranch_vccnz .LBB15_5 ; GFX9-NEXT: ; %bb.4: ; %if -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, s2 -; GFX9-NEXT: s_mov_b32 s5, s3 -; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_mov_b32 s0, s6 +; GFX9-NEXT: s_mov_b32 s1, s7 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_branch .LBB15_6 ; GFX9-NEXT: .LBB15_5: -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: .LBB15_6: ; %endif ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: mul32_in_branch: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_lg_u32 s0, 0 ; GFX10-NEXT: s_cbranch_scc0 .LBB15_2 ; GFX10-NEXT: ; %bb.1: ; %else -; GFX10-NEXT: s_mul_i32 s5, s0, s1 +; GFX10-NEXT: s_mul_i32 s0, s0, s1 ; GFX10-NEXT: s_branch .LBB15_3 ; GFX10-NEXT: .LBB15_2: -; GFX10-NEXT: s_mov_b32 s4, -1 -; GFX10-NEXT: ; implicit-def: $sgpr5 +; GFX10-NEXT: s_mov_b32 s8, -1 +; GFX10-NEXT: ; implicit-def: $sgpr0 ; GFX10-NEXT: .LBB15_3: ; %Flow -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_andn2_b32 vcc_lo, exec_lo, s8 ; GFX10-NEXT: s_cbranch_vccnz .LBB15_5 ; GFX10-NEXT: ; %bb.4: ; %if -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s4, s2 -; GFX10-NEXT: s_mov_b32 s5, s3 -; GFX10-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX10-NEXT: s_mov_b32 s0, s6 +; GFX10-NEXT: s_mov_b32 s1, s7 +; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_branch .LBB15_6 ; GFX10-NEXT: .LBB15_5: -; GFX10-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: .LBB15_6: ; %endif ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: mul32_in_branch: @@ -2472,72 +2472,72 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: mul64_in_branch: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 ; GFX9-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX9-NEXT: ; %bb.1: ; %else -; GFX9-NEXT: s_mul_i32 s7, s4, s7 -; GFX9-NEXT: s_mul_hi_u32 s10, s4, s6 -; GFX9-NEXT: s_add_i32 s7, s10, s7 -; GFX9-NEXT: s_mul_i32 s5, s5, s6 -; GFX9-NEXT: s_add_i32 s5, s7, s5 -; GFX9-NEXT: s_mul_i32 s4, s4, s6 -; GFX9-NEXT: s_andn2_b64 vcc, exec, s[8:9] +; GFX9-NEXT: s_mul_i32 s2, s8, s11 +; GFX9-NEXT: s_mul_hi_u32 s3, s8, s10 +; GFX9-NEXT: s_add_i32 s2, s3, s2 +; GFX9-NEXT: s_mul_i32 s3, s9, s10 +; GFX9-NEXT: s_add_i32 s3, s2, s3 +; GFX9-NEXT: s_mul_i32 s2, s8, s10 +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GFX9-NEXT: s_cbranch_vccnz .LBB16_4 ; GFX9-NEXT: .LBB16_2: ; %if -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_mov_b32 s4, s2 -; GFX9-NEXT: s_mov_b32 s5, s3 -; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s0, s6 +; GFX9-NEXT: s_mov_b32 s1, s7 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_branch .LBB16_5 ; GFX9-NEXT: .LBB16_3: -; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-NEXT: ; implicit-def: $sgpr2_sgpr3 ; GFX9-NEXT: s_branch .LBB16_2 ; GFX9-NEXT: .LBB16_4: -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: .LBB16_5: ; %endif -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: mul64_in_branch: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX10-NEXT: s_cmp_lg_u64 s[8:9], 0 ; GFX10-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX10-NEXT: ; %bb.1: ; %else -; GFX10-NEXT: s_mul_i32 s7, s4, s7 -; GFX10-NEXT: s_mul_hi_u32 s8, s4, s6 -; GFX10-NEXT: s_mul_i32 s5, s5, s6 -; GFX10-NEXT: s_add_i32 s7, s8, s7 -; GFX10-NEXT: s_mul_i32 s4, s4, s6 -; GFX10-NEXT: s_add_i32 s5, s7, s5 +; GFX10-NEXT: s_mul_i32 s0, s8, s11 +; GFX10-NEXT: s_mul_hi_u32 s1, s8, s10 +; GFX10-NEXT: s_mul_i32 s2, s9, s10 +; GFX10-NEXT: s_add_i32 s0, s1, s0 +; GFX10-NEXT: s_add_i32 s1, s0, s2 +; GFX10-NEXT: s_mul_i32 s0, s8, s10 ; GFX10-NEXT: s_cbranch_execnz .LBB16_4 ; GFX10-NEXT: .LBB16_2: ; %if -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 -; GFX10-NEXT: s_mov_b32 s4, s2 -; GFX10-NEXT: s_mov_b32 s5, s3 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_mov_b32 s0, s6 +; GFX10-NEXT: s_mov_b32 s1, s7 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; GFX10-NEXT: s_branch .LBB16_5 ; GFX10-NEXT: .LBB16_3: -; GFX10-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX10-NEXT: ; implicit-def: $sgpr0_sgpr1 ; GFX10-NEXT: s_branch .LBB16_2 ; GFX10-NEXT: .LBB16_4: -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: .LBB16_5: ; %endif -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: mul64_in_branch: @@ -3102,12 +3102,12 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; ; GFX9-LABEL: v_mul_i128: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 4, v0 ; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v13, s[0:1] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v13, s[2:3] +; GFX9-NEXT: global_load_dwordx4 v[0:3], v13, s[4:5] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v13, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v4, 0 ; GFX9-NEXT: v_mul_lo_u32 v14, v5, v2 @@ -3128,18 +3128,18 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; GFX9-NEXT: v_add3_u32 v3, v16, v3, v4 ; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v1, v3, vcc -; GFX9-NEXT: global_store_dwordx4 v13, v[8:11], s[2:3] +; GFX9-NEXT: global_store_dwordx4 v13, v[8:11], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_mul_i128: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c ; GFX10-NEXT: v_lshlrev_b32_e32 v13, 4, v0 ; GFX10-NEXT: v_mov_b32_e32 v10, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v13, s[0:1] -; GFX10-NEXT: global_load_dwordx4 v[4:7], v13, s[2:3] +; GFX10-NEXT: global_load_dwordx4 v[0:3], v13, s[4:5] +; GFX10-NEXT: global_load_dwordx4 v[4:7], v13, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mad_u64_u32 v[8:9], s0, v0, v4, 0 ; GFX10-NEXT: v_mul_lo_u32 v15, v5, v2 @@ -3160,7 +3160,7 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; GFX10-NEXT: v_add3_u32 v3, v7, v3, v12 ; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo -; GFX10-NEXT: global_store_dwordx4 v13, v[8:11], s[2:3] +; GFX10-NEXT: global_store_dwordx4 v13, v[8:11], s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i128: diff --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll index 342f36b6fa622..a1099554559af 100644 --- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll @@ -158,18 +158,18 @@ define amdgpu_kernel void @test_umul24_i16_vgpr_sext(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: test_umul24_i16_vgpr_sext: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] -; GFX9-NEXT: global_load_ushort v3, v1, s[2:3] -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] +; GFX9-NEXT: global_load_ushort v3, v1, s[6:7] +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u16_e32 v0, v2, v3 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %tid.y = call i32 @llvm.amdgcn.workitem.id.y() @@ -279,17 +279,17 @@ define amdgpu_kernel void @test_umul24_i16_vgpr(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_umul24_i16_vgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] -; GFX9-NEXT: global_load_ushort v3, v1, s[2:3] -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] +; GFX9-NEXT: global_load_ushort v3, v1, s[6:7] +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u16_e32 v0, v2, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %tid.y = call i32 @llvm.amdgcn.workitem.id.y() diff --git a/llvm/test/CodeGen/AMDGPU/optimize-compare.ll b/llvm/test/CodeGen/AMDGPU/optimize-compare.ll index d73b1bd29c981..7e5def00ee7cb 100644 --- a/llvm/test/CodeGen/AMDGPU/optimize-compare.ll +++ b/llvm/test/CodeGen/AMDGPU/optimize-compare.ll @@ -62,15 +62,15 @@ define amdgpu_kernel void @if_masked_0x80000000(i32 %arg, ptr addrspace(1) %p) define amdgpu_kernel void @if_masked_0x8000000000000000(i64 %arg, ptr addrspace(1) %p) { ; GCN-LABEL: if_masked_0x8000000000000000: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN-NEXT: s_mov_b32 s0, 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_and_b32 s1, s1, 0x80000000 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_and_b32 s1, s5, 0x80000000 ; GCN-NEXT: s_cmp_eq_u64 s[0:1], 0 ; GCN-NEXT: s_cselect_b32 s0, 22, 33 ; GCN-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NEXT: global_store_dword v0, v1, s[2:3] +; GCN-NEXT: global_store_dword v0, v1, s[6:7] ; GCN-NEXT: s_endpgm %and = and i64 %arg, 9223372036854775808 %cmp = icmp eq i64 %and, 0 diff --git a/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll b/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll index 79082a54c6a36..d6fc52bdb9322 100644 --- a/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll @@ -299,15 +299,15 @@ define amdgpu_kernel void @fma_vector_vector_neg_vector_hi(ptr addrspace(1) %out ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c ; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: ds_read_b32 v1, v0 ; GCN-NEXT: ds_read_b32 v2, v0 offset:4 ; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1] -; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: global_store_dword v3, v0, s[2:3] ; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 @@ -330,15 +330,15 @@ define amdgpu_kernel void @fma_vector_vector_vector_neg_hi(ptr addrspace(1) %out ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c ; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: ds_read_b32 v1, v0 ; GCN-NEXT: ds_read_b32 v2, v0 offset:4 ; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 neg_hi:[0,0,1] -; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: global_store_dword v3, v0, s[2:3] ; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 @@ -362,14 +362,14 @@ define amdgpu_kernel void @add_vector_scalar_hi(ptr addrspace(1) %out, ptr addrs ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c ; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: ds_read_b32 v1, v0 ; GCN-NEXT: ds_read_b32 v0, v0 offset:4 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_add_u16 v0, v1, v0 op_sel:[0,1] -; GCN-NEXT: global_store_dword v2, v0, s[0:1] +; GCN-NEXT: global_store_dword v2, v0, s[2:3] ; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x i16>, ptr addrspace(3) %lds, i32 1 @@ -389,15 +389,15 @@ define amdgpu_kernel void @fma_vector_vector_scalar_hi(ptr addrspace(1) %out, pt ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c ; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: ds_read_b32 v1, v0 ; GCN-NEXT: ds_read_b32 v2, v0 offset:4 ; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] -; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: global_store_dword v3, v0, s[2:3] ; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 @@ -420,15 +420,15 @@ define amdgpu_kernel void @fma_vector_vector_neg_vector_lo_neg_hi(ptr addrspace( ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c ; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: ds_read_b32 v1, v0 ; GCN-NEXT: ds_read_b32 v2, v0 offset:4 ; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 -; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: global_store_dword v3, v0, s[2:3] ; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 @@ -453,15 +453,15 @@ define amdgpu_kernel void @fma_vector_vector_swap_vector(ptr addrspace(1) %out, ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c ; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: ds_read_b32 v1, v0 ; GCN-NEXT: ds_read_b32 v2, v0 offset:4 ; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0] -; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: global_store_dword v3, v0, s[2:3] ; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 @@ -483,15 +483,15 @@ define amdgpu_kernel void @fma_vector_vector_swap_neg_vector(ptr addrspace(1) %o ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c ; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: ds_read_b32 v1, v0 ; GCN-NEXT: ds_read_b32 v2, v0 offset:4 ; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1] -; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: global_store_dword v3, v0, s[2:3] ; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 @@ -514,15 +514,15 @@ define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_0(ptr addrs ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c ; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: ds_read_b32 v1, v0 ; GCN-NEXT: ds_read_b32 v2, v0 offset:4 ; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1] -; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: global_store_dword v3, v0, s[2:3] ; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 @@ -544,15 +544,15 @@ define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_1(ptr addrs ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c ; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: ds_read_b32 v1, v0 ; GCN-NEXT: ds_read_b32 v2, v0 offset:4 ; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 neg_lo:[0,0,1] -; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: global_store_dword v3, v0, s[2:3] ; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 @@ -574,15 +574,15 @@ define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_2(ptr addrs ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c ; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: ds_read_b32 v1, v0 ; GCN-NEXT: ds_read_b32 v2, v0 offset:4 ; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 neg_hi:[0,0,1] -; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: global_store_dword v3, v0, s[2:3] ; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 @@ -604,15 +604,15 @@ define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_3(ptr addrs ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c ; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: ds_read_b32 v1, v0 ; GCN-NEXT: ds_read_b32 v2, v0 offset:4 ; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] neg_lo:[0,0,1] -; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: global_store_dword v3, v0, s[2:3] ; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 @@ -634,16 +634,16 @@ define amdgpu_kernel void @bitcast_fneg_f32(ptr addrspace(1) %out, ptr addrspace ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c ; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: ds_read_b32 v0, v0 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ds_read_b32 v1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GCN-NEXT: v_pk_add_f16 v0, v0, v1 -; GCN-NEXT: global_store_dword v2, v0, s[0:1] +; GCN-NEXT: global_store_dword v2, v0, s[2:3] ; GCN-NEXT: s_endpgm bb: %vec0 = load volatile <2 x half>, ptr addrspace(3) %lds, align 4 @@ -661,16 +661,16 @@ define amdgpu_kernel void @shuffle_bitcast_fneg_f32(ptr addrspace(1) %out, ptr a ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c ; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: ds_read_b32 v0, v0 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ds_read_b32 v1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GCN-NEXT: v_pk_add_f16 v0, v0, v1 op_sel:[0,1] op_sel_hi:[1,0] -; GCN-NEXT: global_store_dword v2, v0, s[0:1] +; GCN-NEXT: global_store_dword v2, v0, s[2:3] ; GCN-NEXT: s_endpgm bb: %vec0 = load volatile <2 x half>, ptr addrspace(3) %lds, align 4 @@ -689,18 +689,18 @@ define amdgpu_kernel void @extract_from_i64(ptr addrspace(1) %out, ptr addrspace ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c ; GCN-NEXT: v_mov_b32_e32 v3, 0xffff -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_and_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GCN-NEXT: v_lshl_or_b32 v0, v0, 16, v3 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_add_u16 v0, v2, v0 -; GCN-NEXT: global_store_dword v1, v0, s[0:1] +; GCN-NEXT: global_store_dword v1, v0, s[2:3] ; GCN-NEXT: s_endpgm bb: %vec0 = load volatile <2 x i16>, ptr addrspace(3) %lds, align 4 @@ -726,14 +726,14 @@ define amdgpu_kernel void @bitcast_lo_elt_op_sel(ptr addrspace(1) %out, ptr addr ; GCN-NEXT: ds_read_b32 v1, v0 ; GCN-NEXT: ds_read_b32 v2, v0 offset:4 ; GCN-NEXT: ds_read_b32 v0, v0 offset:8 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_ushort v3, v[0:1], off glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_pk_add_f16 v0, v0, 2.0 op_sel_hi:[1,0] ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0] -; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: global_store_dword v3, v0, s[2:3] ; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 @@ -764,7 +764,7 @@ define amdgpu_kernel void @mix_elt_types_op_sel(ptr addrspace(1) %out, ptr addrs ; GCN-NEXT: ds_read_b32 v1, v0 ; GCN-NEXT: ds_read_b32 v2, v0 offset:4 ; GCN-NEXT: ds_read_b32 v0, v0 offset:8 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GCN-NEXT: ; kill: killed $vgpr0_vgpr1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_ushort v3, v[0:1], off glc @@ -776,7 +776,7 @@ define amdgpu_kernel void @mix_elt_types_op_sel(ptr addrspace(1) %out, ptr addrs ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0] -; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: global_store_dword v3, v0, s[2:3] ; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll b/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll index f53ca53518a17..40542bc6f05a7 100644 --- a/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll +++ b/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll @@ -8,16 +8,16 @@ define amdgpu_kernel void @dbg_clause(ptr addrspace(1) %out, ptr addrspace(1) %aptr) !dbg !4 { ; GCN-LABEL: dbg_clause: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dword v1, v0, s[2:3] +; GCN-NEXT: global_load_dword v1, v0, s[6:7] ; GCN-NEXT: ;DEBUG_VALUE: foo:a <- $vgpr1 -; GCN-NEXT: global_load_dword v2, v0, s[2:3] offset:32 +; GCN-NEXT: global_load_dword v2, v0, s[6:7] offset:32 ; GCN-NEXT: ;DEBUG_VALUE: foo:b <- $vgpr2 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_f32_e32 v1, v1, v2 -; GCN-NEXT: global_store_dword v0, v1, s[0:1] +; GCN-NEXT: global_store_dword v0, v1, s[4:5] ; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/promote-vect3-load.ll b/llvm/test/CodeGen/AMDGPU/promote-vect3-load.ll index dabb9d43bf3d6..0c8dbd1db8ec6 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-vect3-load.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-vect3-load.ll @@ -21,7 +21,7 @@ define protected amdgpu_kernel void @load_v3i32_align8(ptr addrspace(1) %arg) #0 ; GCN-LABEL: load_v3i32_align8: ; GCN: ; %bb.0: ; GCN: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x0 +; GCN-NEXT: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[4:5], 0x0 %vec = load <3 x i32>, ptr addrspace(1) %arg, align 8 store <3 x i32> %vec, ptr addrspace(1) undef, align 8 ret void @@ -52,7 +52,7 @@ define protected amdgpu_kernel void @load_v3f32_align8(ptr addrspace(1) %arg) #0 ; GCN-LABEL: load_v3f32_align8: ; GCN: ; %bb.0: ; GCN: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x0 +; GCN-NEXT: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[4:5], 0x0 %vec = load <3 x float>, ptr addrspace(1) %arg, align 8 store <3 x float> %vec, ptr addrspace(1) undef, align 8 ret void diff --git a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll index 011bb332ddd0a..f916c9375bc6d 100644 --- a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll +++ b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll @@ -110,46 +110,46 @@ define amdgpu_kernel void @buffers_from_flat_dont_alias(ptr noalias %a.flat, ptr define amdgpu_kernel void @buffers_might_alias(ptr addrspace(8) %a, ptr addrspace(8) %b) { ; SDAG-LABEL: buffers_might_alias: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; SDAG-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SDAG-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0 -; SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; SDAG-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 +; SDAG-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SDAG-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:4 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0 -; SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 -; SDAG-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 +; SDAG-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 +; SDAG-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:8 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0 -; SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8 -; SDAG-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:12 +; SDAG-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:8 +; SDAG-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:12 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0 -; SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:12 +; SDAG-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:12 ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: buffers_might_alias: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0 -; GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GISEL-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 +; GISEL-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:4 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0 -; GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 -; GISEL-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 +; GISEL-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 +; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:8 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0 -; GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8 -; GISEL-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:12 +; GISEL-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:8 +; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:12 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0 -; GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:12 +; GISEL-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:12 ; GISEL-NEXT: s_endpgm %l0 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %a, i32 0, i32 0, i32 0) %s0 = fmul float %l0, %l0 @@ -173,28 +173,28 @@ define amdgpu_kernel void @buffers_might_alias(ptr addrspace(8) %a, ptr addrspac define amdgpu_kernel void @independent_offsets(ptr addrspace(8) %a) { ; SDAG-LABEL: independent_offsets: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SDAG-NEXT: v_mov_b32_e32 v2, 1.0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; SDAG-NEXT: buffer_load_dword v1, v0, s[4:7], 0 offen offset:4 ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; SDAG-NEXT: buffer_store_dword v2, v0, s[4:7], 0 offen ; SDAG-NEXT: s_waitcnt vmcnt(1) -; SDAG-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 +; SDAG-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen offset:8 ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: independent_offsets: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GISEL-NEXT: v_mov_b32_e32 v2, 1.0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GISEL-NEXT: buffer_load_dword v1, v0, s[4:7], 0 offen offset:4 ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GISEL-NEXT: buffer_store_dword v2, v0, s[4:7], 0 offen ; GISEL-NEXT: s_waitcnt vmcnt(1) -; GISEL-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 +; GISEL-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen offset:8 ; GISEL-NEXT: s_endpgm %lane = call i32 @llvm.amdgcn.workitem.id.x() %idx = shl i32 %lane, 2 diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll index 684279a3776fc..57c936d4689eb 100644 --- a/llvm/test/CodeGen/AMDGPU/saddo.ll +++ b/llvm/test/CodeGen/AMDGPU/saddo.ll @@ -288,35 +288,35 @@ define amdgpu_kernel void @v_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_saddo_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[8:9] +; GFX9-NEXT: global_load_dword v2, v0, s[10:11] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_i32 v3, v1, v2 clamp ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v1, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX9-NEXT: global_store_byte v0, v1, s[2:3] +; GFX9-NEXT: global_store_byte v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_saddo_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-NEXT: global_load_dword v1, v0, s[8:9] +; GFX10-NEXT: global_load_dword v2, v0, s[10:11] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_i32 v3, v1, v2 clamp ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-NEXT: global_store_byte v0, v2, s[2:3] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_byte v0, v2, s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_saddo_i32: @@ -401,38 +401,38 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: s_saddo_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s8, s4, s6 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_addc_u32 s9, s5, s7 -; GFX9-NEXT: v_cmp_lt_i64_e64 s[10:11], s[6:7], 0 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] +; GFX9-NEXT: s_add_u32 s0, s8, s10 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX9-NEXT: s_xor_b64 s[0:1], s[10:11], vcc +; GFX9-NEXT: s_addc_u32 s1, s9, s11 +; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[10:11], 0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], vcc +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GFX9-NEXT: global_store_byte v2, v0, s[2:3] +; GFX9-NEXT: global_store_byte v2, v0, s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_saddo_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s8, s4, s6 -; GFX10-NEXT: s_addc_u32 s9, s5, s7 -; GFX10-NEXT: v_cmp_lt_i64_e64 s6, s[6:7], 0 -; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[8:9], s[4:5] -; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: v_mov_b32_e32 v1, s9 -; GFX10-NEXT: s_xor_b32 s4, s6, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX10-NEXT: global_store_byte v2, v3, s[2:3] +; GFX10-NEXT: s_add_u32 s0, s8, s10 +; GFX10-NEXT: s_addc_u32 s1, s9, s11 +; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[10:11], 0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[0:1], s[8:9] +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_xor_b32 s0, s2, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-NEXT: global_store_byte v2, v3, s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_saddo_i64: @@ -656,11 +656,11 @@ define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: v_saddo_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[4:5] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[8:9] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[10:11] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v5, v1, v3 ; GFX9-NEXT: v_add_i32 v1, v1, v3 clamp @@ -670,18 +670,18 @@ define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v4, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] -; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v6, v[4:5], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_saddo_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v5, s[4:5] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v5, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v5, s[8:9] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v5, s[10:11] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v4, v1, v3 ; GFX10-NEXT: v_add_nc_i32 v1, v1, v3 clamp @@ -691,8 +691,8 @@ define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v3, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX10-NEXT: global_store_dwordx2 v5, v[3:4], s[0:1] -; GFX10-NEXT: global_store_dwordx2 v5, v[0:1], s[2:3] +; GFX10-NEXT: global_store_dwordx2 v5, v[3:4], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v5, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_saddo_v2i32: diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll index b57a51f1382ae..c10600a78e3e7 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll @@ -241,23 +241,23 @@ define amdgpu_kernel void @sdiv_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: sdiv_i32_4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_mov_b32 s10, s6 -; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s2 -; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 ; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 30, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 2, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: sdiv_i32_4: @@ -339,25 +339,25 @@ define amdgpu_kernel void @slow_sdiv_i32_3435(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: slow_sdiv_i32_3435: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_mov_b32 s10, s6 -; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s2 -; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 ; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s2, 0x98a1930b -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s0, 0x98a1930b +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mul_hi_i32 v1, v0, s2 +; GFX9-NEXT: v_mul_hi_i32 v1, v0, s0 +; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: v_add_u32_e32 v0, v1, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 31, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 11, v0 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: slow_sdiv_i32_3435: @@ -732,17 +732,17 @@ define amdgpu_kernel void @sdiv_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: sdiv_v2i32_4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_mov_b32 s10, s6 -; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s2 -; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 ; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v1 @@ -752,7 +752,7 @@ define amdgpu_kernel void @sdiv_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1) ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 2, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 2, v1 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: sdiv_v2i32_4: @@ -1371,17 +1371,17 @@ define amdgpu_kernel void @sdiv_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: sdiv_v4i32_4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_mov_b32 s10, s6 -; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s2 -; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 ; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v1 @@ -1399,7 +1399,7 @@ define amdgpu_kernel void @sdiv_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1) ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 2, v1 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 2, v2 ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 2, v3 -; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: sdiv_v4i32_4: @@ -1515,18 +1515,18 @@ define amdgpu_kernel void @v_sdiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX9-LABEL: v_sdiv_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_mov_b32 s10, s6 -; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s2 -; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 ; GFX9-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 offset:1 ; GFX9-NEXT: buffer_load_sbyte v1, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1543,7 +1543,7 @@ define amdgpu_kernel void @v_sdiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX9-NEXT: v_add_u32_e32 v0, v4, v0 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v_sdiv_i8: @@ -2253,21 +2253,21 @@ define amdgpu_kernel void @scalarize_mulhs_4xi32(ptr addrspace(1) nocapture read ; ; GFX9-LABEL: scalarize_mulhs_4xi32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; GFX9-NEXT: s_mov_b32 s0, 0x1389c755 -; GFX9-NEXT: s_mov_b32 s4, s2 -; GFX9-NEXT: s_mov_b32 s5, s3 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; GFX9-NEXT: s_mov_b32 s4, 0x1389c755 +; GFX9-NEXT: s_mov_b32 s0, s6 +; GFX9-NEXT: s_mov_b32 s1, s7 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mul_hi_i32 v0, v0, s0 -; GFX9-NEXT: v_mul_hi_i32 v1, v1, s0 -; GFX9-NEXT: v_mul_hi_i32 v2, v2, s0 -; GFX9-NEXT: v_mul_hi_i32 v3, v3, s0 +; GFX9-NEXT: v_mul_hi_i32 v0, v0, s4 +; GFX9-NEXT: v_mul_hi_i32 v1, v1, s4 +; GFX9-NEXT: v_mul_hi_i32 v2, v2, s4 +; GFX9-NEXT: v_mul_hi_i32 v3, v3, s4 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 12, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 31, v1 @@ -2280,7 +2280,7 @@ define amdgpu_kernel void @scalarize_mulhs_4xi32(ptr addrspace(1) nocapture read ; GFX9-NEXT: v_add_u32_e32 v1, v1, v5 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v6 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v7 -; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: scalarize_mulhs_4xi32: diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll index 669ed915a002a..64b3317edc519 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -36,24 +36,24 @@ define amdgpu_kernel void @add_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: add_shr_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: add_shr_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm %a = load i32, ptr addrspace(1) %in, align 4 %shr = lshr i32 %a, 16 @@ -94,24 +94,24 @@ define amdgpu_kernel void @sub_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: sub_shr_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_u32_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sub_shr_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_sub_nc_u32_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm %a = load i32, ptr addrspace(1) %in, align 4 %shr = lshr i32 %a, 16 @@ -1483,29 +1483,29 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16( ; ; GFX9-LABEL: sitofp_v2i16_to_v2f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f16_i16_e32 v2, v1 ; GFX9-NEXT: v_cvt_f16_i16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sitofp_v2i16_to_v2f16: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f16_i16_e32 v2, v1 ; GFX10-NEXT: v_cvt_f16_i16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v2 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) #0 { @@ -1644,27 +1644,27 @@ define amdgpu_kernel void @immediate_mul_v2i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: immediate_mul_v2i16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_mov_b32 s0, 0x141007b ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[2:3] -; GFX9-NEXT: s_mov_b32 s2, 0x141007b +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, s2 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, s0 +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: immediate_mul_v2i16: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_mul_lo_u16 v0, 0x141007b, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -1929,12 +1929,12 @@ define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrsp ; ; GFX9-LABEL: pulled_out_test: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 8 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] ; GFX9-NEXT: s_movk_i32 s0, 0xff +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_sdwa v4, v3, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v0 @@ -1950,18 +1950,18 @@ define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrsp ; GFX9-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: pulled_out_test: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 8 ; GFX10-NEXT: v_mov_b32_e32 v4, 24 ; GFX10-NEXT: v_mov_b32_e32 v5, 0xff ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v6, v3, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_lshrrev_b32_sdwa v7, v4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -1975,7 +1975,7 @@ define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrsp ; GFX10-NEXT: v_or_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm entry: %idxprom = ashr exact i64 15, 32 diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll index 4952f80c0b411..9b9f03ff74aa3 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -90,13 +90,13 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_shl_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v1, v0 -; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_shl_v2i16: @@ -142,13 +142,13 @@ define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-LABEL: v_shl_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_lshlrev_b16 v0, v1, v0 -; GFX10-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-NEXT: global_store_dword v2, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_shl_v2i16: @@ -374,13 +374,13 @@ define amdgpu_kernel void @shl_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: shl_imm_v_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, 8 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: shl_imm_v_v2i16: @@ -426,13 +426,13 @@ define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10-LABEL: shl_imm_v_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_lshlrev_b16 v1, v1, 8 op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: shl_imm_v_v2i16: @@ -462,13 +462,13 @@ define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: shl_v_imm_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: shl_v_imm_v2i16: @@ -510,13 +510,13 @@ define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10-LABEL: shl_v_imm_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: shl_v_imm_v2i16: @@ -546,14 +546,14 @@ define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_shl_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] +; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_lshlrev_b16 v1, v3, v1 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v2, v0 -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_shl_v4i16: @@ -609,14 +609,14 @@ define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-LABEL: v_shl_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] +; GFX10-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_lshlrev_b16 v1, v3, v1 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, v2, v0 -; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_shl_v4i16: @@ -649,14 +649,14 @@ define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: shl_v_imm_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: shl_v_imm_v4i16: @@ -708,14 +708,14 @@ define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10-LABEL: shl_v_imm_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: shl_v_imm_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll index ddf331816694a..88e2bb772a2d3 100644 --- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -84,24 +84,24 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_i32_x_sub_64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subrev_u32_e32 v1, 64, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_i32_x_sub_64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 64, v1 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_i32_x_sub_64: @@ -223,35 +223,35 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_i32_x_sub_64_multi_use: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subrev_u32_e32 v1, 64, v1 ; GFX9-NEXT: v_subrev_u32_e32 v2, 64, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-NEXT: global_store_dword v0, v2, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_i32_x_sub_64_multi_use: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 64, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 64, v2 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_store_dword v0, v2, s[0:1] +; GFX10-NEXT: global_store_dword v0, v2, s[4:5] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm ; @@ -359,24 +359,24 @@ define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_i32_64_sub_x: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_u32_e32 v1, 64, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_i32_64_sub_x: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 64, v1 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_i32_64_sub_x: @@ -474,46 +474,46 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-SDAG-LABEL: v_test_i32_x_sub_65: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_add_u32_e32 v1, 0xffffffbf, v1 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: v_test_i32_x_sub_65: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, 0x41, v1 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX10-SDAG-LABEL: v_test_i32_x_sub_65: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX10-SDAG-NEXT: v_add_nc_u32_e32 v1, 0xffffffbf, v1 -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_test_i32_x_sub_65: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 0x41, v1 -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_test_i32_x_sub_65: @@ -626,24 +626,24 @@ define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_i32_65_sub_x: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_u32_e32 v1, 0x41, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_i32_65_sub_x: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0x41, v1 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_i32_65_sub_x: @@ -741,46 +741,46 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add ; ; GFX9-SDAG-LABEL: v_test_i32_x_sub_neg16: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_add_u32_e32 v1, 16, v1 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: v_test_i32_x_sub_neg16: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, -16, v1 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX10-SDAG-LABEL: v_test_i32_x_sub_neg16: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX10-SDAG-NEXT: v_add_nc_u32_e32 v1, 16, v1 -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_test_i32_x_sub_neg16: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, -16, v1 -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_test_i32_x_sub_neg16: @@ -893,24 +893,24 @@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: v_test_i32_neg16_sub_x: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_u32_e32 v1, -16, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_i32_neg16_sub_x: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_sub_nc_u32_e32 v1, -16, v1 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_i32_neg16_sub_x: @@ -1008,46 +1008,46 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add ; ; GFX9-SDAG-LABEL: v_test_i32_x_sub_neg17: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_add_u32_e32 v1, 17, v1 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: v_test_i32_x_sub_neg17: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, 0xffffffef, v1 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX10-SDAG-LABEL: v_test_i32_x_sub_neg17: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX10-SDAG-NEXT: v_add_nc_u32_e32 v1, 17, v1 -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_test_i32_x_sub_neg17: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 0xffffffef, v1 -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_test_i32_x_sub_neg17: @@ -1160,24 +1160,24 @@ define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: v_test_i32_neg17_sub_x: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_u32_e32 v1, 0xffffffef, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_i32_neg17_sub_x: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0xffffffef, v1 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_i32_neg17_sub_x: @@ -1330,24 +1330,24 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_i16_x_sub_64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subrev_u16_e32 v1, 64, v1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_i16_x_sub_64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_sub_nc_u16 v1, v1, 64 -; GFX10-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_i16_x_sub_64: @@ -1451,27 +1451,27 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out ; ; GFX9-LABEL: v_test_i16_x_sub_64_zext_to_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v1, s[2:3] +; GFX9-NEXT: global_load_ushort v1, v1, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subrev_u16_e32 v1, 64, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_i16_x_sub_64_zext_to_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v1, s[2:3] +; GFX10-NEXT: global_load_ushort v1, v1, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_sub_nc_u16 v1, v1, 64 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_i16_x_sub_64_zext_to_i32: @@ -1597,35 +1597,35 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_i16_x_sub_64_multi_use: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] glc +; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subrev_u16_e32 v1, 64, v1 ; GFX9-NEXT: v_subrev_u16_e32 v2, 64, v2 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_short v0, v2, s[0:1] +; GFX9-NEXT: global_store_short v0, v2, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_i16_x_sub_64_multi_use: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_sub_nc_u16 v1, v1, 64 ; GFX10-NEXT: v_sub_nc_u16 v2, v2, 64 -; GFX10-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_store_short v0, v2, s[0:1] +; GFX10-NEXT: global_store_short v0, v2, s[4:5] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm ; @@ -1748,24 +1748,24 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a ; ; GFX9-LABEL: v_test_v2i16_x_sub_64_64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v1, v1, 64 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_sub_64_64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_i16 v1, v1, 64 op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_v2i16_x_sub_64_64: @@ -1878,37 +1878,37 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad ; ; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_7_64: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_mov_b32 s0, 0x400007 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-SDAG-NEXT: s_mov_b32 s2, 0x400007 +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_pk_sub_i16 v1, v1, s2 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-SDAG-NEXT: v_pk_sub_i16 v1, v1, s0 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_7_64: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x400007 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_pk_sub_i16 v1, v1, v2 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_sub_7_64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0x400007 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_v2i16_x_sub_7_64: @@ -2021,37 +2021,37 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr ; ; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_64_123: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_mov_b32 s0, 0x7b0040 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-SDAG-NEXT: s_mov_b32 s2, 0x7b0040 +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_pk_sub_i16 v1, v1, s2 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-SDAG-NEXT: v_pk_sub_i16 v1, v1, s0 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_64_123: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x7b0040 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_pk_sub_i16 v1, v1, v2 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_sub_64_123: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0x7b0040 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_v2i16_x_sub_64_123: @@ -2161,24 +2161,24 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: v_test_v2i16_x_sub_7_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v1, v1, 7 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_sub_7_0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_i16 v1, v1, 7 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_v2i16_x_sub_7_0: @@ -2286,24 +2286,24 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: v_test_v2i16_x_sub_0_16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_sub_0_16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_i16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_v2i16_x_sub_0_16: @@ -2410,37 +2410,37 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a ; ; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_0_1_0: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_brev_b32 s0, 35 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-SDAG-NEXT: s_brev_b32 s2, 35 +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_pk_sub_i16 v1, v1, s2 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-SDAG-NEXT: v_pk_sub_i16 v1, v1, s0 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_0_1_0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v2, 35 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_pk_sub_i16 v1, v1, v2 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_sub_0_1_0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0xc4000000 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_v2i16_x_sub_0_1_0: @@ -2547,37 +2547,37 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt ; ; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_0_neg1_0: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_brev_b32 s0, 34 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-SDAG-NEXT: s_brev_b32 s2, 34 +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_pk_sub_i16 v1, v1, s2 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-SDAG-NEXT: v_pk_sub_i16 v1, v1, s0 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_0_neg1_0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v2, 34 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_pk_sub_i16 v1, v1, v2 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_sub_0_neg1_0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0x44000000 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_v2i16_x_sub_0_neg1_0: @@ -2691,24 +2691,24 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_v2i16_x_add_neg32_neg32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_add_neg32_neg32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_v2i16_x_add_neg32_neg32: @@ -2815,24 +2815,24 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_test_v2i16_x_add_0_neg32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_add_0_neg32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_v2i16_x_add_0_neg32: @@ -2941,24 +2941,24 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_test_v2i16_x_add_neg32_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_add_neg32_0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_v2i16_x_add_neg32_0: @@ -3072,24 +3072,24 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_v2i16_x_add_neg16_neg16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_u16 v1, v1, -16 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_add_neg16_neg16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v1, v1, -16 op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_v2i16_x_add_neg16_neg16: @@ -3196,24 +3196,24 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_test_v2i16_x_add_0_neg16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_add_0_neg16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_v2i16_x_add_0_neg16: @@ -3322,24 +3322,24 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_test_v2i16_x_add_neg16_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_u16 v1, v1, 16 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_add_neg16_0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_u16 v1, v1, 16 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_v2i16_x_add_neg16_0: @@ -3452,48 +3452,48 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p ; ; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_movk_i32 s0, 0xc400 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-SDAG-NEXT: s_movk_i32 s2, 0xc400 +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, s2 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, s0 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xc400c400 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v1, v2 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX10-SDAG-NEXT: v_pk_add_u16 v1, 0xffffc400, v1 op_sel_hi:[0,1] -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_pk_add_u16 v1, 0xc400c400, v1 -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone: @@ -3621,48 +3621,48 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out ; ; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_movk_i32 s0, 0x4400 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-SDAG-NEXT: s_movk_i32 s2, 0x4400 +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, s2 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, s0 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x44004400 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v1, v2 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX10-SDAG-NEXT: v_pk_add_u16 v1, 0x4400, v1 op_sel_hi:[0,1] -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_pk_add_u16 v1, 0x44004400, v1 -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone: @@ -3790,24 +3790,24 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p ; ; GFX9-LABEL: v_test_v2i16_x_add_neg_fptwo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_u16 v1, v1, 2.0 op_sel:[0,1] -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_add_neg_fptwo: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v1, v1, 2.0 op_sel:[0,1] -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_v2i16_x_add_neg_fptwo: @@ -3920,24 +3920,24 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out ; ; GFX9-LABEL: v_test_v2i16_x_add_neg_negfptwo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_u16 v1, v1, -2.0 op_sel:[0,1] -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_add_neg_negfptwo: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v1, v1, -2.0 op_sel:[0,1] -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_v2i16_x_add_neg_negfptwo: @@ -4043,24 +4043,24 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_v2i16_x_add_undef_neg32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_add_undef_neg32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_v2i16_x_add_undef_neg32: @@ -4163,47 +4163,47 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ; ; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v1, 32 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_not_b32_e32 v2, 31 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v1, v2 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX10-SDAG-NEXT: v_pk_sub_u16 v1, v1, 32 -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_pk_add_u16 v1, 0xffffffe0, v1 -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef: diff --git a/llvm/test/CodeGen/AMDGPU/smrd.ll b/llvm/test/CodeGen/AMDGPU/smrd.ll index 52db7fea08e05..c9c00a84e0f4b 100644 --- a/llvm/test/CodeGen/AMDGPU/smrd.ll +++ b/llvm/test/CodeGen/AMDGPU/smrd.ll @@ -92,8 +92,8 @@ entry: ; GCN-LABEL: {{^}}smrd6: ; SICIVI: s_add_u32 s{{[0-9]}}, s{{[0-9]}}, -4 ; SICIVI: s_load_dword s{{[0-9]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0 -; GFX9_10: s_add_u32 s2, s2, -4 -; GFX9_10: s_addc_u32 s3, s3, -1 +; GFX9_10: s_add_u32 s0, s6, -4 +; GFX9_10: s_addc_u32 s1, s7, -1 ; GFX9_10: s_load_dword s{{[0-9]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0 define amdgpu_kernel void @smrd6(ptr addrspace(1) %out, ptr addrspace(4) %ptr) #0 { entry: diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll index abf013e39eefa..7b0241984a349 100644 --- a/llvm/test/CodeGen/AMDGPU/srem.ll +++ b/llvm/test/CodeGen/AMDGPU/srem.ll @@ -7,20 +7,20 @@ define amdgpu_kernel void @srem_i16_7(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_i16_7: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_movk_i32 s0, 0x4925 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_ushort v1, v0, s[2:3] -; GCN-NEXT: s_movk_i32 s2, 0x4925 +; GCN-NEXT: global_load_ushort v1, v0, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_bfe_i32 v2, v1, 0, 16 -; GCN-NEXT: v_mul_lo_u32 v2, v2, s2 +; GCN-NEXT: v_mul_lo_u32 v2, v2, s0 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 31, v2 ; GCN-NEXT: v_ashrrev_i32_e32 v2, 17, v2 ; GCN-NEXT: v_add_u16_e32 v2, v2, v3 ; GCN-NEXT: v_mul_lo_u16_e32 v2, 7, v2 ; GCN-NEXT: v_sub_u16_e32 v1, v1, v2 -; GCN-NEXT: global_store_short v0, v1, s[0:1] +; GCN-NEXT: global_store_short v0, v1, s[4:5] ; GCN-NEXT: s_endpgm ; ; TAHITI-LABEL: srem_i16_7: @@ -113,38 +113,38 @@ define amdgpu_kernel void @srem_i16_7(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GCN-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s2, v1 -; GCN-NEXT: s_abs_i32 s2, s2 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, s2 -; GCN-NEXT: v_readfirstlane_b32 s3, v0 -; GCN-NEXT: s_sub_i32 s5, 0, s2 -; GCN-NEXT: s_ashr_i32 s4, s3, 31 +; GCN-NEXT: v_readfirstlane_b32 s0, v1 +; GCN-NEXT: s_abs_i32 s0, s0 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s0 +; GCN-NEXT: v_readfirstlane_b32 s1, v0 +; GCN-NEXT: s_sub_i32 s3, 0, s0 +; GCN-NEXT: s_ashr_i32 s2, s1, 31 ; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GCN-NEXT: s_abs_i32 s3, s3 +; GCN-NEXT: s_abs_i32 s1, s1 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_readfirstlane_b32 s6, v0 -; GCN-NEXT: s_mul_i32 s5, s5, s6 -; GCN-NEXT: s_mul_hi_u32 s5, s6, s5 -; GCN-NEXT: s_add_i32 s6, s6, s5 -; GCN-NEXT: s_mul_hi_u32 s5, s3, s6 -; GCN-NEXT: s_mul_i32 s5, s5, s2 -; GCN-NEXT: s_sub_i32 s3, s3, s5 -; GCN-NEXT: s_sub_i32 s5, s3, s2 -; GCN-NEXT: s_cmp_ge_u32 s3, s2 -; GCN-NEXT: s_cselect_b32 s3, s5, s3 -; GCN-NEXT: s_sub_i32 s5, s3, s2 -; GCN-NEXT: s_cmp_ge_u32 s3, s2 -; GCN-NEXT: s_cselect_b32 s2, s5, s3 -; GCN-NEXT: s_xor_b32 s2, s2, s4 -; GCN-NEXT: s_sub_i32 s2, s2, s4 -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: global_store_dword v2, v0, s[0:1] +; GCN-NEXT: s_mul_i32 s3, s3, s6 +; GCN-NEXT: s_mul_hi_u32 s3, s6, s3 +; GCN-NEXT: s_add_i32 s6, s6, s3 +; GCN-NEXT: s_mul_hi_u32 s3, s1, s6 +; GCN-NEXT: s_mul_i32 s3, s3, s0 +; GCN-NEXT: s_sub_i32 s1, s1, s3 +; GCN-NEXT: s_sub_i32 s3, s1, s0 +; GCN-NEXT: s_cmp_ge_u32 s1, s0 +; GCN-NEXT: s_cselect_b32 s1, s3, s1 +; GCN-NEXT: s_sub_i32 s3, s1, s0 +; GCN-NEXT: s_cmp_ge_u32 s1, s0 +; GCN-NEXT: s_cselect_b32 s0, s3, s1 +; GCN-NEXT: s_xor_b32 s0, s0, s2 +; GCN-NEXT: s_sub_i32 s0, s0, s2 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: global_store_dword v2, v0, s[4:5] ; GCN-NEXT: s_endpgm ; ; TAHITI-LABEL: srem_i32: @@ -277,17 +277,17 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @srem_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_i32_4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dword v1, v0, s[2:3] +; GCN-NEXT: global_load_dword v1, v0, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 30, v2 ; GCN-NEXT: v_add_u32_e32 v2, v1, v2 ; GCN-NEXT: v_and_b32_e32 v2, -4, v2 ; GCN-NEXT: v_sub_u32_e32 v1, v1, v2 -; GCN-NEXT: global_store_dword v0, v1, s[0:1] +; GCN-NEXT: global_store_dword v0, v1, s[4:5] ; GCN-NEXT: s_endpgm ; ; TAHITI-LABEL: srem_i32_4: @@ -363,20 +363,20 @@ define amdgpu_kernel void @srem_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @srem_i32_7(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_i32_7: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_mov_b32 s0, 0x92492493 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dword v1, v0, s[2:3] -; GCN-NEXT: s_mov_b32 s2, 0x92492493 +; GCN-NEXT: global_load_dword v1, v0, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_hi_i32 v2, v1, s2 +; GCN-NEXT: v_mul_hi_i32 v2, v1, s0 ; GCN-NEXT: v_add_u32_e32 v2, v2, v1 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 31, v2 ; GCN-NEXT: v_ashrrev_i32_e32 v2, 2, v2 ; GCN-NEXT: v_add_u32_e32 v2, v2, v3 ; GCN-NEXT: v_mul_lo_u32 v2, v2, 7 ; GCN-NEXT: v_sub_u32_e32 v1, v1, v2 -; GCN-NEXT: global_store_dword v0, v1, s[0:1] +; GCN-NEXT: global_store_dword v0, v1, s[4:5] ; GCN-NEXT: s_endpgm ; ; TAHITI-LABEL: srem_i32_7: @@ -459,64 +459,64 @@ define amdgpu_kernel void @srem_i32_7(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @srem_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_v2i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] +; GCN-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s2, v2 -; GCN-NEXT: s_abs_i32 s2, s2 -; GCN-NEXT: v_cvt_f32_u32_e32 v2, s2 -; GCN-NEXT: v_readfirstlane_b32 s3, v0 -; GCN-NEXT: s_sub_i32 s6, 0, s2 -; GCN-NEXT: s_ashr_i32 s5, s3, 31 +; GCN-NEXT: v_readfirstlane_b32 s0, v2 +; GCN-NEXT: s_abs_i32 s0, s0 +; GCN-NEXT: v_cvt_f32_u32_e32 v2, s0 +; GCN-NEXT: v_readfirstlane_b32 s1, v0 +; GCN-NEXT: s_sub_i32 s6, 0, s0 +; GCN-NEXT: s_ashr_i32 s3, s1, 31 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GCN-NEXT: s_abs_i32 s3, s3 -; GCN-NEXT: v_readfirstlane_b32 s4, v3 +; GCN-NEXT: s_abs_i32 s1, s1 +; GCN-NEXT: v_readfirstlane_b32 s2, v3 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_readfirstlane_b32 s7, v0 ; GCN-NEXT: s_mul_i32 s6, s6, s7 ; GCN-NEXT: s_mul_hi_u32 s6, s7, s6 ; GCN-NEXT: s_add_i32 s7, s7, s6 -; GCN-NEXT: s_mul_hi_u32 s6, s3, s7 -; GCN-NEXT: s_mul_i32 s6, s6, s2 -; GCN-NEXT: s_sub_i32 s3, s3, s6 -; GCN-NEXT: s_sub_i32 s6, s3, s2 -; GCN-NEXT: s_cmp_ge_u32 s3, s2 -; GCN-NEXT: s_cselect_b32 s3, s6, s3 -; GCN-NEXT: s_sub_i32 s6, s3, s2 -; GCN-NEXT: s_cmp_ge_u32 s3, s2 -; GCN-NEXT: s_cselect_b32 s2, s6, s3 -; GCN-NEXT: s_abs_i32 s3, s4 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GCN-NEXT: s_xor_b32 s2, s2, s5 -; GCN-NEXT: s_sub_i32 s7, 0, s3 -; GCN-NEXT: s_sub_i32 s2, s2, s5 +; GCN-NEXT: s_mul_hi_u32 s6, s1, s7 +; GCN-NEXT: s_mul_i32 s6, s6, s0 +; GCN-NEXT: s_sub_i32 s1, s1, s6 +; GCN-NEXT: s_sub_i32 s6, s1, s0 +; GCN-NEXT: s_cmp_ge_u32 s1, s0 +; GCN-NEXT: s_cselect_b32 s1, s6, s1 +; GCN-NEXT: s_sub_i32 s6, s1, s0 +; GCN-NEXT: s_cmp_ge_u32 s1, s0 +; GCN-NEXT: s_cselect_b32 s0, s6, s1 +; GCN-NEXT: s_abs_i32 s1, s2 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s1 +; GCN-NEXT: s_xor_b32 s0, s0, s3 +; GCN-NEXT: s_sub_i32 s7, 0, s1 +; GCN-NEXT: s_sub_i32 s0, s0, s3 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s4, v1 -; GCN-NEXT: s_ashr_i32 s6, s4, 31 -; GCN-NEXT: s_abs_i32 s4, s4 +; GCN-NEXT: v_readfirstlane_b32 s2, v1 +; GCN-NEXT: s_ashr_i32 s6, s2, 31 +; GCN-NEXT: s_abs_i32 s2, s2 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s5, v0 -; GCN-NEXT: s_mul_i32 s7, s7, s5 -; GCN-NEXT: s_mul_hi_u32 s7, s5, s7 -; GCN-NEXT: s_add_i32 s5, s5, s7 -; GCN-NEXT: s_mul_hi_u32 s5, s4, s5 -; GCN-NEXT: s_mul_i32 s5, s5, s3 -; GCN-NEXT: s_sub_i32 s4, s4, s5 -; GCN-NEXT: s_sub_i32 s5, s4, s3 -; GCN-NEXT: s_cmp_ge_u32 s4, s3 -; GCN-NEXT: s_cselect_b32 s4, s5, s4 -; GCN-NEXT: s_sub_i32 s5, s4, s3 -; GCN-NEXT: s_cmp_ge_u32 s4, s3 -; GCN-NEXT: s_cselect_b32 s3, s5, s4 -; GCN-NEXT: s_xor_b32 s3, s3, s6 -; GCN-NEXT: s_sub_i32 s3, s3, s6 -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GCN-NEXT: v_readfirstlane_b32 s3, v0 +; GCN-NEXT: s_mul_i32 s7, s7, s3 +; GCN-NEXT: s_mul_hi_u32 s7, s3, s7 +; GCN-NEXT: s_add_i32 s3, s3, s7 +; GCN-NEXT: s_mul_hi_u32 s3, s2, s3 +; GCN-NEXT: s_mul_i32 s3, s3, s1 +; GCN-NEXT: s_sub_i32 s2, s2, s3 +; GCN-NEXT: s_sub_i32 s3, s2, s1 +; GCN-NEXT: s_cmp_ge_u32 s2, s1 +; GCN-NEXT: s_cselect_b32 s2, s3, s2 +; GCN-NEXT: s_sub_i32 s3, s2, s1 +; GCN-NEXT: s_cmp_ge_u32 s2, s1 +; GCN-NEXT: s_cselect_b32 s1, s3, s2 +; GCN-NEXT: s_xor_b32 s1, s1, s6 +; GCN-NEXT: s_sub_i32 s1, s1, s6 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] ; GCN-NEXT: s_endpgm ; ; TAHITI-LABEL: srem_v2i32: @@ -723,26 +723,26 @@ define amdgpu_kernel void @srem_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @srem_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_v2i32_4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GCN-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s2, v0 -; GCN-NEXT: v_readfirstlane_b32 s3, v1 -; GCN-NEXT: s_ashr_i32 s4, s2, 31 -; GCN-NEXT: s_ashr_i32 s5, s3, 31 -; GCN-NEXT: s_lshr_b32 s4, s4, 30 -; GCN-NEXT: s_lshr_b32 s5, s5, 30 -; GCN-NEXT: s_add_i32 s4, s2, s4 -; GCN-NEXT: s_add_i32 s5, s3, s5 -; GCN-NEXT: s_and_b32 s4, s4, -4 -; GCN-NEXT: s_and_b32 s5, s5, -4 -; GCN-NEXT: s_sub_i32 s2, s2, s4 -; GCN-NEXT: s_sub_i32 s3, s3, s5 -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: v_readfirstlane_b32 s1, v1 +; GCN-NEXT: s_ashr_i32 s2, s0, 31 +; GCN-NEXT: s_ashr_i32 s3, s1, 31 +; GCN-NEXT: s_lshr_b32 s2, s2, 30 +; GCN-NEXT: s_lshr_b32 s3, s3, 30 +; GCN-NEXT: s_add_i32 s2, s0, s2 +; GCN-NEXT: s_add_i32 s3, s1, s3 +; GCN-NEXT: s_and_b32 s2, s2, -4 +; GCN-NEXT: s_and_b32 s3, s3, -4 +; GCN-NEXT: s_sub_i32 s0, s0, s2 +; GCN-NEXT: s_sub_i32 s1, s1, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GCN-NEXT: s_endpgm ; ; TAHITI-LABEL: srem_v2i32_4: @@ -842,118 +842,118 @@ define amdgpu_kernel void @srem_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_v4i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[1:4], v0, s[2:3] offset:16 -; GCN-NEXT: global_load_dwordx4 v[5:8], v0, s[2:3] +; GCN-NEXT: global_load_dwordx4 v[1:4], v0, s[6:7] offset:16 +; GCN-NEXT: global_load_dwordx4 v[5:8], v0, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_readfirstlane_b32 s2, v1 -; GCN-NEXT: s_abs_i32 s2, s2 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, s2 -; GCN-NEXT: s_sub_i32 s6, 0, s2 +; GCN-NEXT: v_readfirstlane_b32 s0, v1 +; GCN-NEXT: s_abs_i32 s0, s0 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s0 +; GCN-NEXT: s_sub_i32 s6, 0, s0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s4, v5 -; GCN-NEXT: s_ashr_i32 s5, s4, 31 +; GCN-NEXT: v_readfirstlane_b32 s2, v5 +; GCN-NEXT: s_ashr_i32 s3, s2, 31 ; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GCN-NEXT: s_abs_i32 s4, s4 -; GCN-NEXT: v_readfirstlane_b32 s3, v2 +; GCN-NEXT: s_abs_i32 s2, s2 +; GCN-NEXT: v_readfirstlane_b32 s1, v2 ; GCN-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_readfirstlane_b32 s7, v1 ; GCN-NEXT: s_mul_i32 s6, s6, s7 ; GCN-NEXT: s_mul_hi_u32 s6, s7, s6 ; GCN-NEXT: s_add_i32 s7, s7, s6 -; GCN-NEXT: s_mul_hi_u32 s6, s4, s7 -; GCN-NEXT: s_mul_i32 s6, s6, s2 -; GCN-NEXT: s_sub_i32 s4, s4, s6 -; GCN-NEXT: s_sub_i32 s6, s4, s2 -; GCN-NEXT: s_cmp_ge_u32 s4, s2 -; GCN-NEXT: s_cselect_b32 s4, s6, s4 -; GCN-NEXT: s_sub_i32 s6, s4, s2 -; GCN-NEXT: s_cmp_ge_u32 s4, s2 -; GCN-NEXT: s_cselect_b32 s2, s6, s4 -; GCN-NEXT: s_abs_i32 s3, s3 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GCN-NEXT: s_xor_b32 s2, s2, s5 -; GCN-NEXT: s_sub_i32 s8, 0, s3 -; GCN-NEXT: s_sub_i32 s2, s2, s5 +; GCN-NEXT: s_mul_hi_u32 s6, s2, s7 +; GCN-NEXT: s_mul_i32 s6, s6, s0 +; GCN-NEXT: s_sub_i32 s2, s2, s6 +; GCN-NEXT: s_sub_i32 s6, s2, s0 +; GCN-NEXT: s_cmp_ge_u32 s2, s0 +; GCN-NEXT: s_cselect_b32 s2, s6, s2 +; GCN-NEXT: s_sub_i32 s6, s2, s0 +; GCN-NEXT: s_cmp_ge_u32 s2, s0 +; GCN-NEXT: s_cselect_b32 s0, s6, s2 +; GCN-NEXT: s_abs_i32 s1, s1 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s1 +; GCN-NEXT: s_xor_b32 s0, s0, s3 +; GCN-NEXT: s_sub_i32 s8, 0, s1 +; GCN-NEXT: s_sub_i32 s0, s0, s3 ; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GCN-NEXT: v_readfirstlane_b32 s6, v6 ; GCN-NEXT: s_ashr_i32 s7, s6, 31 ; GCN-NEXT: s_abs_i32 s6, s6 ; GCN-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_readfirstlane_b32 s4, v3 -; GCN-NEXT: v_readfirstlane_b32 s5, v1 -; GCN-NEXT: s_mul_i32 s8, s8, s5 -; GCN-NEXT: s_mul_hi_u32 s8, s5, s8 -; GCN-NEXT: s_add_i32 s5, s5, s8 -; GCN-NEXT: s_mul_hi_u32 s5, s6, s5 -; GCN-NEXT: s_mul_i32 s5, s5, s3 -; GCN-NEXT: s_sub_i32 s5, s6, s5 -; GCN-NEXT: s_sub_i32 s6, s5, s3 -; GCN-NEXT: s_cmp_ge_u32 s5, s3 -; GCN-NEXT: s_cselect_b32 s5, s6, s5 -; GCN-NEXT: s_sub_i32 s6, s5, s3 -; GCN-NEXT: s_cmp_ge_u32 s5, s3 -; GCN-NEXT: s_cselect_b32 s3, s6, s5 -; GCN-NEXT: s_abs_i32 s4, s4 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, s4 -; GCN-NEXT: s_xor_b32 s3, s3, s7 -; GCN-NEXT: s_sub_i32 s9, 0, s4 -; GCN-NEXT: s_sub_i32 s3, s3, s7 +; GCN-NEXT: v_readfirstlane_b32 s2, v3 +; GCN-NEXT: v_readfirstlane_b32 s3, v1 +; GCN-NEXT: s_mul_i32 s8, s8, s3 +; GCN-NEXT: s_mul_hi_u32 s8, s3, s8 +; GCN-NEXT: s_add_i32 s3, s3, s8 +; GCN-NEXT: s_mul_hi_u32 s3, s6, s3 +; GCN-NEXT: s_mul_i32 s3, s3, s1 +; GCN-NEXT: s_sub_i32 s3, s6, s3 +; GCN-NEXT: s_sub_i32 s6, s3, s1 +; GCN-NEXT: s_cmp_ge_u32 s3, s1 +; GCN-NEXT: s_cselect_b32 s3, s6, s3 +; GCN-NEXT: s_sub_i32 s6, s3, s1 +; GCN-NEXT: s_cmp_ge_u32 s3, s1 +; GCN-NEXT: s_cselect_b32 s1, s6, s3 +; GCN-NEXT: s_abs_i32 s2, s2 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GCN-NEXT: s_xor_b32 s1, s1, s7 +; GCN-NEXT: s_sub_i32 s9, 0, s2 +; GCN-NEXT: s_sub_i32 s1, s1, s7 ; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GCN-NEXT: v_readfirstlane_b32 s6, v7 ; GCN-NEXT: s_ashr_i32 s8, s6, 31 ; GCN-NEXT: s_abs_i32 s6, s6 ; GCN-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_readfirstlane_b32 s5, v4 +; GCN-NEXT: v_readfirstlane_b32 s3, v4 ; GCN-NEXT: v_readfirstlane_b32 s7, v1 ; GCN-NEXT: s_mul_i32 s9, s9, s7 ; GCN-NEXT: s_mul_hi_u32 s9, s7, s9 ; GCN-NEXT: s_add_i32 s7, s7, s9 ; GCN-NEXT: s_mul_hi_u32 s7, s6, s7 -; GCN-NEXT: s_mul_i32 s7, s7, s4 +; GCN-NEXT: s_mul_i32 s7, s7, s2 ; GCN-NEXT: s_sub_i32 s6, s6, s7 -; GCN-NEXT: s_sub_i32 s7, s6, s4 -; GCN-NEXT: s_cmp_ge_u32 s6, s4 +; GCN-NEXT: s_sub_i32 s7, s6, s2 +; GCN-NEXT: s_cmp_ge_u32 s6, s2 ; GCN-NEXT: s_cselect_b32 s6, s7, s6 -; GCN-NEXT: s_sub_i32 s7, s6, s4 -; GCN-NEXT: s_cmp_ge_u32 s6, s4 -; GCN-NEXT: s_cselect_b32 s4, s7, s6 -; GCN-NEXT: s_abs_i32 s5, s5 -; GCN-NEXT: v_cvt_f32_u32_e32 v2, s5 +; GCN-NEXT: s_sub_i32 s7, s6, s2 +; GCN-NEXT: s_cmp_ge_u32 s6, s2 +; GCN-NEXT: s_cselect_b32 s2, s7, s6 +; GCN-NEXT: s_abs_i32 s3, s3 +; GCN-NEXT: v_cvt_f32_u32_e32 v2, s3 ; GCN-NEXT: v_readfirstlane_b32 s6, v8 -; GCN-NEXT: v_mov_b32_e32 v1, s2 -; GCN-NEXT: s_ashr_i32 s2, s6, 31 +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: s_ashr_i32 s0, s6, 31 ; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v2 -; GCN-NEXT: v_mov_b32_e32 v2, s3 -; GCN-NEXT: s_abs_i32 s3, s6 -; GCN-NEXT: s_sub_i32 s6, 0, s5 +; GCN-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NEXT: s_abs_i32 s1, s6 +; GCN-NEXT: s_sub_i32 s6, 0, s3 ; GCN-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GCN-NEXT: s_xor_b32 s4, s4, s8 -; GCN-NEXT: s_sub_i32 s4, s4, s8 +; GCN-NEXT: s_xor_b32 s2, s2, s8 +; GCN-NEXT: s_sub_i32 s2, s2, s8 ; GCN-NEXT: v_readfirstlane_b32 s7, v3 ; GCN-NEXT: s_mul_i32 s6, s6, s7 ; GCN-NEXT: s_mul_hi_u32 s6, s7, s6 ; GCN-NEXT: s_add_i32 s7, s7, s6 -; GCN-NEXT: s_mul_hi_u32 s6, s3, s7 -; GCN-NEXT: s_mul_i32 s6, s6, s5 -; GCN-NEXT: s_sub_i32 s3, s3, s6 -; GCN-NEXT: s_sub_i32 s6, s3, s5 -; GCN-NEXT: s_cmp_ge_u32 s3, s5 -; GCN-NEXT: s_cselect_b32 s3, s6, s3 -; GCN-NEXT: s_sub_i32 s6, s3, s5 -; GCN-NEXT: s_cmp_ge_u32 s3, s5 -; GCN-NEXT: s_cselect_b32 s3, s6, s3 -; GCN-NEXT: s_xor_b32 s3, s3, s2 -; GCN-NEXT: s_sub_i32 s2, s3, s2 -; GCN-NEXT: v_mov_b32_e32 v3, s4 -; GCN-NEXT: v_mov_b32_e32 v4, s2 -; GCN-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] +; GCN-NEXT: s_mul_hi_u32 s6, s1, s7 +; GCN-NEXT: s_mul_i32 s6, s6, s3 +; GCN-NEXT: s_sub_i32 s1, s1, s6 +; GCN-NEXT: s_sub_i32 s6, s1, s3 +; GCN-NEXT: s_cmp_ge_u32 s1, s3 +; GCN-NEXT: s_cselect_b32 s1, s6, s1 +; GCN-NEXT: s_sub_i32 s6, s1, s3 +; GCN-NEXT: s_cmp_ge_u32 s1, s3 +; GCN-NEXT: s_cselect_b32 s1, s6, s1 +; GCN-NEXT: s_xor_b32 s1, s1, s0 +; GCN-NEXT: s_sub_i32 s0, s1, s0 +; GCN-NEXT: v_mov_b32_e32 v3, s2 +; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: global_store_dwordx4 v0, v[1:4], s[4:5] ; GCN-NEXT: s_endpgm ; ; TAHITI-LABEL: srem_v4i32: @@ -1317,40 +1317,40 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @srem_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_v4i32_4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] +; GCN-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s2, v0 -; GCN-NEXT: v_readfirstlane_b32 s3, v1 -; GCN-NEXT: v_readfirstlane_b32 s4, v2 -; GCN-NEXT: v_readfirstlane_b32 s5, v3 -; GCN-NEXT: s_ashr_i32 s6, s2, 31 -; GCN-NEXT: s_ashr_i32 s7, s3, 31 -; GCN-NEXT: s_ashr_i32 s8, s4, 31 -; GCN-NEXT: s_ashr_i32 s9, s5, 31 +; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: v_readfirstlane_b32 s1, v1 +; GCN-NEXT: v_readfirstlane_b32 s2, v2 +; GCN-NEXT: v_readfirstlane_b32 s3, v3 +; GCN-NEXT: s_ashr_i32 s6, s0, 31 +; GCN-NEXT: s_ashr_i32 s7, s1, 31 +; GCN-NEXT: s_ashr_i32 s8, s2, 31 +; GCN-NEXT: s_ashr_i32 s9, s3, 31 ; GCN-NEXT: s_lshr_b32 s6, s6, 30 ; GCN-NEXT: s_lshr_b32 s7, s7, 30 ; GCN-NEXT: s_lshr_b32 s8, s8, 30 ; GCN-NEXT: s_lshr_b32 s9, s9, 30 -; GCN-NEXT: s_add_i32 s6, s2, s6 -; GCN-NEXT: s_add_i32 s7, s3, s7 -; GCN-NEXT: s_add_i32 s8, s4, s8 -; GCN-NEXT: s_add_i32 s9, s5, s9 +; GCN-NEXT: s_add_i32 s6, s0, s6 +; GCN-NEXT: s_add_i32 s7, s1, s7 +; GCN-NEXT: s_add_i32 s8, s2, s8 +; GCN-NEXT: s_add_i32 s9, s3, s9 ; GCN-NEXT: s_and_b32 s6, s6, -4 ; GCN-NEXT: s_and_b32 s7, s7, -4 ; GCN-NEXT: s_and_b32 s8, s8, -4 ; GCN-NEXT: s_and_b32 s9, s9, -4 -; GCN-NEXT: s_sub_i32 s2, s2, s6 -; GCN-NEXT: s_sub_i32 s3, s3, s7 -; GCN-NEXT: s_sub_i32 s4, s4, s8 -; GCN-NEXT: s_sub_i32 s5, s5, s9 -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GCN-NEXT: s_sub_i32 s0, s0, s6 +; GCN-NEXT: s_sub_i32 s1, s1, s7 +; GCN-NEXT: s_sub_i32 s2, s2, s8 +; GCN-NEXT: s_sub_i32 s3, s3, s9 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GCN-NEXT: s_endpgm ; ; TAHITI-LABEL: srem_v4i32_4: @@ -2589,10 +2589,10 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @srem_i64_4(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_i64_4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GCN-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v1 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 30, v3 @@ -2601,7 +2601,7 @@ define amdgpu_kernel void @srem_i64_4(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_and_b32_e32 v3, -4, v3 ; GCN-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v3 ; GCN-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v4, vcc -; GCN-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GCN-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GCN-NEXT: s_endpgm ; ; TAHITI-LABEL: srem_i64_4: @@ -4733,10 +4733,10 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @srem_v2i64_4(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_v2i64_4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] +; GCN-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v1 ; GCN-NEXT: v_lshrrev_b32_e32 v5, 30, v5 @@ -4752,7 +4752,7 @@ define amdgpu_kernel void @srem_v2i64_4(ptr addrspace(1) %out, ptr addrspace(1) ; GCN-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc ; GCN-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v6 ; GCN-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v8, vcc -; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GCN-NEXT: s_endpgm ; ; TAHITI-LABEL: srem_v2i64_4: @@ -8883,11 +8883,11 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @srem_v4i64_4(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_v4i64_4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] -; GCN-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16 +; GCN-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] +; GCN-NEXT: global_load_dwordx4 v[4:7], v8, s[6:7] offset:16 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_ashrrev_i32_e32 v9, 31, v1 ; GCN-NEXT: v_lshrrev_b32_e32 v9, 30, v9 @@ -8918,8 +8918,8 @@ define amdgpu_kernel void @srem_v4i64_4(ptr addrspace(1) %out, ptr addrspace(1) ; GCN-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v15, vcc ; GCN-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v12 ; GCN-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v16, vcc -; GCN-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GCN-NEXT: global_store_dwordx4 v8, v[4:7], s[4:5] offset:16 +; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] ; GCN-NEXT: s_endpgm ; ; TAHITI-LABEL: srem_v4i64_4: diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll index a66226a0ef108..df12e32d971a2 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.ll @@ -144,13 +144,13 @@ define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: test_sub_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1 -; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: test_sub_i32: @@ -208,13 +208,13 @@ define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: test_sub_imm_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_u32_e32 v1, 0x7b, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: test_sub_imm_i32: @@ -272,14 +272,14 @@ define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: test_sub_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] +; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_u32_e32 v1, v1, v3 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v2 -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: test_sub_v2i32: @@ -350,17 +350,17 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: test_sub_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:16 -; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] +; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] offset:16 +; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_u32_e32 v3, v7, v3 ; GFX9-NEXT: v_sub_u32_e32 v2, v6, v2 ; GFX9-NEXT: v_sub_u32_e32 v1, v5, v1 ; GFX9-NEXT: v_sub_u32_e32 v0, v4, v0 -; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: test_sub_v4i32: @@ -432,16 +432,16 @@ define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: test_sub_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc +; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_sub_u16_e32 v1, v1, v2 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: test_sub_i16: @@ -517,14 +517,14 @@ define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: test_sub_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 -; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: test_sub_v2i16: @@ -608,15 +608,15 @@ define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: test_sub_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] +; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v1, v1, v3 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, v2 -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: test_sub_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll index 02cc7d1185cf3..5a821db6ff040 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -279,16 +279,16 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_constant: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x1c8007b +; GFX9-NEXT: s_mov_b32 s0, 0x1c8007b ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_pk_sub_i16 v0, v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: v_pk_sub_i16 v0, v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_test_sub_v2i16_constant: @@ -312,16 +312,16 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: v_test_sub_v2i16_constant: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: v_pk_sub_i16 v0, v0, 0x1c8007b -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_sub_v2i16_constant: @@ -353,16 +353,16 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_neg_constant: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s4, 0xfc21fcb3 +; GFX9-NEXT: s_mov_b32 s0, 0xfc21fcb3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_pk_sub_i16 v0, v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: v_pk_sub_i16 v0, v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_test_sub_v2i16_neg_constant: @@ -386,16 +386,16 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out, ; ; GFX10-LABEL: v_test_sub_v2i16_neg_constant: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: v_pk_sub_i16 v0, v0, 0xfc21fcb3 -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_sub_v2i16_neg_constant: @@ -426,15 +426,15 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_inline_neg1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, -1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_test_sub_v2i16_inline_neg1: @@ -458,16 +458,16 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, p ; ; GFX10-LABEL: v_test_sub_v2i16_inline_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: v_pk_sub_i16 v0, v0, -1 -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_sub_v2i16_inline_neg1: @@ -498,15 +498,15 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, p define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_inline_lo_zero_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, 32 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_test_sub_v2i16_inline_lo_zero_hi: @@ -529,16 +529,16 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) % ; ; GFX10-LABEL: v_test_sub_v2i16_inline_lo_zero_hi: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: v_pk_sub_i16 v0, v0, 32 -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_sub_v2i16_inline_lo_zero_hi: @@ -570,15 +570,15 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) % define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_inline_fp_split: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, 1.0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_test_sub_v2i16_inline_fp_split: @@ -601,16 +601,16 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %ou ; ; GFX10-LABEL: v_test_sub_v2i16_inline_fp_split: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: v_pk_sub_i16 v0, v0, 1.0 -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_sub_v2i16_inline_fp_split: diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll b/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll index e668c1d2b7f3d..d9e0e0298e072 100644 --- a/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll +++ b/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll @@ -11,11 +11,11 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY killed $sgpr1 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:sreg_64 = COPY killed [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:sreg_64 = COPY killed [[COPY1]] - ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (invariant load (<2 x s32>) from %ir.ptr, align 4, addrspace 4) + ; CHECK-NEXT: early-clobber %11:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY2]], 0, 0 :: (invariant load (<2 x s32>) from %ir.ptr, align 4, addrspace 4) ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed [[COPY2]], 8, 0 :: (invariant load (s32) from %ir.ptr + 8, addrspace 4) - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY %11.sub0 ; CHECK-NEXT: $sgpr0 = COPY killed [[COPY3]] - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY killed [[S_LOAD_DWORDX2_IMM]].sub1 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY killed %11.sub1 ; CHECK-NEXT: $sgpr1 = COPY killed [[COPY4]] ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY killed [[S_LOAD_DWORD_IMM]] ; CHECK-NEXT: $sgpr2 = COPY killed [[COPY5]] diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll index 03a1b3598024b..c08571a733cc5 100644 --- a/llvm/test/CodeGen/AMDGPU/uaddo.ll +++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll @@ -182,16 +182,16 @@ define amdgpu_kernel void @v_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_uaddo_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[8:9] +; GFX9-NEXT: global_load_dword v2, v0, s[10:11] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-NEXT: global_store_byte v0, v2, s[2:3] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_byte v0, v2, s[6:7] ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -268,19 +268,19 @@ define amdgpu_kernel void @v_uaddo_i32_novcc(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: v_uaddo_i32_novcc: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[8:9] +; GFX9-NEXT: global_load_dword v2, v0, s[10:11] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: global_store_byte v0, v2, s[2:3] +; GFX9-NEXT: global_store_byte v0, v2, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -345,19 +345,19 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: s_uaddo_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s6, s4, s6 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_addc_u32 s7, s5, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: s_add_u32 s0, s8, s10 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_addc_u32 s1, s9, s11 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] -; GFX9-NEXT: global_store_byte v4, v0, s[2:3] +; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] +; GFX9-NEXT: global_store_byte v4, v0, s[6:7] ; GFX9-NEXT: s_endpgm %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) %val = extractvalue { i64, i1 } %uadd, 0 @@ -424,18 +424,18 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_uaddo_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[8:9] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[10:11] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1] -; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: global_store_byte v4, v0, s[2:3] +; GFX9-NEXT: global_store_byte v4, v0, s[6:7] ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -508,17 +508,17 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_uaddo_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] -; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] +; GFX9-NEXT: global_load_ushort v1, v0, s[8:9] +; GFX9-NEXT: global_load_ushort v2, v0, s[10:11] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v2, v1, v2 ; GFX9-NEXT: v_cmp_lt_u16_e32 vcc, v2, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX9-NEXT: global_store_short v0, v2, s[0:1] -; GFX9-NEXT: global_store_byte v0, v1, s[2:3] +; GFX9-NEXT: global_store_short v0, v2, s[4:5] +; GFX9-NEXT: global_store_byte v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -591,18 +591,18 @@ define amdgpu_kernel void @v_uaddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: v_uaddo_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[8:9] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[10:11] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] -; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] ; GFX9-NEXT: s_endpgm %a = load <2 x i32>, ptr addrspace(1) %aptr, align 4 %b = load <2 x i32>, ptr addrspace(1) %bptr, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll index dacc986205983..36a0cbd3f0970 100644 --- a/llvm/test/CodeGen/AMDGPU/usubo.ll +++ b/llvm/test/CodeGen/AMDGPU/usubo.ll @@ -182,16 +182,16 @@ define amdgpu_kernel void @v_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_usubo_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[8:9] +; GFX9-NEXT: global_load_dword v2, v0, s[10:11] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-NEXT: global_store_byte v0, v2, s[2:3] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_byte v0, v2, s[6:7] ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -268,19 +268,19 @@ define amdgpu_kernel void @v_usubo_i32_novcc(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: v_usubo_i32_novcc: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[8:9] +; GFX9-NEXT: global_load_dword v2, v0, s[10:11] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: global_store_byte v0, v2, s[2:3] +; GFX9-NEXT: global_store_byte v0, v2, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -345,19 +345,19 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: s_usubo_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_u32 s6, s4, s6 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_subb_u32 s7, s5, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: s_sub_u32 s0, s8, s10 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_subb_u32 s1, s9, s11 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] -; GFX9-NEXT: global_store_byte v4, v0, s[2:3] +; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] +; GFX9-NEXT: global_store_byte v4, v0, s[6:7] ; GFX9-NEXT: s_endpgm %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) %val = extractvalue { i64, i1 } %usub, 0 @@ -424,18 +424,18 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_usubo_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[8:9] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[10:11] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1] -; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: global_store_byte v4, v0, s[2:3] +; GFX9-NEXT: global_store_byte v4, v0, s[6:7] ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -508,17 +508,17 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_usubo_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] -; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] +; GFX9-NEXT: global_load_ushort v1, v0, s[8:9] +; GFX9-NEXT: global_load_ushort v2, v0, s[10:11] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_u16_e32 v2, v1, v2 ; GFX9-NEXT: v_cmp_gt_u16_e32 vcc, v2, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX9-NEXT: global_store_short v0, v2, s[0:1] -; GFX9-NEXT: global_store_byte v0, v1, s[2:3] +; GFX9-NEXT: global_store_short v0, v2, s[4:5] +; GFX9-NEXT: global_store_byte v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -591,18 +591,18 @@ define amdgpu_kernel void @v_usubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: v_usubo_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[8:9] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[10:11] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] -; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] ; GFX9-NEXT: s_endpgm %a = load <2 x i32>, ptr addrspace(1) %aptr, align 4 %b = load <2 x i32>, ptr addrspace(1) %bptr, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll b/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll index 2210b6c0d3c3a..1bb76cf547e25 100644 --- a/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll +++ b/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll @@ -25,16 +25,16 @@ bb: define amdgpu_kernel void @test_add_co_sdwa(ptr addrspace(1) %arg, ptr addrspace(1) %arg1) #0 { ; GFX9-LABEL: test_add_co_sdwa: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v4, v2, s[2:3] -; GFX9-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1] +; GFX9-NEXT: global_load_dword v4, v2, s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v3, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_co_u32_sdwa v0, vcc, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/v_pack.ll b/llvm/test/CodeGen/AMDGPU/v_pack.ll index 9f6d27802e184..5a7cce39f6103 100644 --- a/llvm/test/CodeGen/AMDGPU/v_pack.ll +++ b/llvm/test/CodeGen/AMDGPU/v_pack.ll @@ -7,12 +7,12 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; GCN-LABEL: v_pack_b32_v2f16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc +; GCN-NEXT: global_load_ushort v1, v0, s[4:5] glc dlc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc +; GCN-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_f16_e32 v0, 2.0, v1 ; GCN-NEXT: v_add_f16_e32 v1, 2.0, v2 @@ -24,12 +24,12 @@ define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace ; ; GISEL-LABEL: v_pack_b32_v2f16: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc +; GISEL-NEXT: global_load_ushort v1, v0, s[4:5] glc dlc ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc +; GISEL-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1 ; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2 @@ -56,12 +56,12 @@ define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; GCN-LABEL: v_pack_b32_v2f16_sub: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc +; GCN-NEXT: global_load_ushort v1, v0, s[4:5] glc dlc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc +; GCN-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_subrev_f16_e32 v0, 2.0, v1 ; GCN-NEXT: v_add_f16_e32 v1, 2.0, v2 @@ -73,12 +73,12 @@ define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrs ; ; GISEL-LABEL: v_pack_b32_v2f16_sub: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc +; GISEL-NEXT: global_load_ushort v1, v0, s[4:5] glc dlc ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc +; GISEL-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_subrev_f16_e32 v0, 2.0, v1 ; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2 @@ -105,36 +105,36 @@ define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrs define amdgpu_kernel void @fptrunc( ; GCN-LABEL: fptrunc: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s7, 0x31016000 -; GCN-NEXT: s_mov_b32 s10, s6 -; GCN-NEXT: s_mov_b32 s11, s7 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s3, 0x31016000 +; GCN-NEXT: s_mov_b32 s10, s2 +; GCN-NEXT: s_mov_b32 s11, s3 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s2 -; GCN-NEXT: s_mov_b32 s9, s3 -; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s8, s6 +; GCN-NEXT: s_mov_b32 s9, s7 +; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GCN-NEXT: v_pack_b32_f16 v0, v0, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GISEL-LABEL: fptrunc: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GISEL-NEXT: s_mov_b32 s6, -1 +; GISEL-NEXT: s_mov_b32 s7, 0x31016000 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2 -; GISEL-NEXT: v_cvt_f16_f32_e32 v1, s3 -; GISEL-NEXT: s_mov_b32 s2, -1 -; GISEL-NEXT: s_mov_b32 s3, 0x31016000 +; GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0 +; GISEL-NEXT: v_cvt_f16_f32_e32 v1, s1 ; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1 -; GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GISEL-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { @@ -147,12 +147,12 @@ define amdgpu_kernel void @fptrunc( define amdgpu_kernel void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; GCN-LABEL: v_pack_b32.fabs: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc +; GCN-NEXT: global_load_ushort v1, v0, s[4:5] glc dlc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc +; GCN-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_f16_e32 v0, 2.0, v1 ; GCN-NEXT: v_add_f16_e32 v1, 2.0, v2 @@ -164,12 +164,12 @@ define amdgpu_kernel void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace( ; ; GISEL-LABEL: v_pack_b32.fabs: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc +; GISEL-NEXT: global_load_ushort v1, v0, s[4:5] glc dlc ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc +; GISEL-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1 ; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2 @@ -198,12 +198,12 @@ define amdgpu_kernel void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace( define amdgpu_kernel void @v_pack_b32.fneg(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; GCN-LABEL: v_pack_b32.fneg: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc +; GCN-NEXT: global_load_ushort v1, v0, s[4:5] glc dlc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc +; GCN-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_f16_e32 v0, 2.0, v1 ; GCN-NEXT: v_add_f16_e32 v1, 2.0, v2 @@ -215,12 +215,12 @@ define amdgpu_kernel void @v_pack_b32.fneg(ptr addrspace(1) %in0, ptr addrspace( ; ; GISEL-LABEL: v_pack_b32.fneg: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc +; GISEL-NEXT: global_load_ushort v1, v0, s[4:5] glc dlc ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc +; GISEL-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1 ; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll b/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll index 02a6024f858e9..973b7f5c04987 100644 --- a/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll +++ b/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll @@ -25,16 +25,16 @@ bb: define amdgpu_kernel void @test_sub_co_sdwa(ptr addrspace(1) %arg, ptr addrspace(1) %arg1) #0 { ; GFX9-LABEL: test_sub_co_sdwa: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v4, v2, s[2:3] -; GFX9-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1] +; GFX9-NEXT: global_load_dword v4, v2, s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v3, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_co_u32_sdwa v0, vcc, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll index 02da6deb96f1f..2a280bcda42f5 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll @@ -473,9 +473,9 @@ define amdgpu_kernel void @livevariables_update_missed_block(ptr addrspace(1) %s ; SI-NEXT: bb.1.if.then: ; SI-NEXT: successors: %bb.7(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.src1.kernarg.offset, align 4, addrspace 4) - ; SI-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, killed %54, 0, implicit $exec - ; SI-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 0, killed [[S_LOAD_DWORDX2_IMM]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; SI-NEXT: early-clobber %33:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.src1.kernarg.offset, align 4, addrspace 4) + ; SI-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %33.sub0, killed %54, 0, implicit $exec + ; SI-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 0, killed %33.sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; SI-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], %subreg.sub0, killed [[V_ADDC_U32_e64_]], %subreg.sub1 ; SI-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8) from %ir.i10, addrspace 1) ; SI-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec @@ -570,9 +570,9 @@ define protected amdgpu_kernel void @nested_waterfalls(ptr addrspace(1) %tex.coe ; SI-NEXT: bb.1.if.then: ; SI-NEXT: successors: %bb.2(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.tex.coerce.kernarg.offset, align 4, addrspace 4) + ; SI-NEXT: early-clobber %10:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.tex.coerce.kernarg.offset, align 4, addrspace 4) ; SI-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = nuw nsw V_LSHLREV_B32_e64 3, killed [[COPY1]](s32), implicit $exec - ; SI-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR killed [[S_LOAD_DWORDX2_IMM]], killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s64) from %ir.idx, addrspace 1) + ; SI-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR killed %10, killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s64) from %ir.idx, addrspace 1) ; SI-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[GLOBAL_LOAD_DWORDX2_SADDR]], 16, 0, implicit $exec :: (invariant load (s128) from %ir.3 + 16, addrspace 4) ; SI-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_]].sub3 ; SI-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_]].sub2 diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll index 7301b341cbc71..33c06e5d1e3a5 100644 --- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll @@ -405,33 +405,33 @@ return: define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) { ; GFX906-LABEL: v8i8_phi_chain: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX906-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[0:1] -; GFX906-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0 -; GFX906-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[4:5] +; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX906-NEXT: s_cbranch_execz .LBB8_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[2:3] +; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[6:7] ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 ; GFX906-NEXT: s_andn2_b64 s[0:1], s[0:1], exec -; GFX906-NEXT: s_and_b64 s[2:3], vcc, exec -; GFX906-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX906-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX906-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX906-NEXT: .LBB8_2: ; %Flow -; GFX906-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX906-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX906-NEXT: s_cbranch_execz .LBB8_4 ; GFX906-NEXT: ; %bb.3: ; %bb.2 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[4:5] +; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[8:9] ; GFX906-NEXT: .LBB8_4: ; %bb.3 ; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[6:7] +; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[10:11] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -460,28 +460,28 @@ bb.3: define amdgpu_kernel void @v8i8_phi_zeroinit(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) { ; GFX906-LABEL: v8i8_phi_zeroinit: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v5, 3, v0 +; GFX906-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx2 v[3:4], v5, s[0:1] -; GFX906-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0 -; GFX906-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GFX906-NEXT: global_load_dwordx2 v[3:4], v5, s[4:5] +; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX906-NEXT: s_cbranch_execz .LBB9_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dwordx2 v[1:2], v5, s[2:3] -; GFX906-NEXT: s_mov_b32 s2, 0 -; GFX906-NEXT: s_mov_b32 s3, s2 +; GFX906-NEXT: global_load_dwordx2 v[1:2], v5, s[6:7] +; GFX906-NEXT: s_mov_b32 s4, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 +; GFX906-NEXT: s_mov_b32 s5, s4 ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_mov_b32_e32 v4, s3 -; GFX906-NEXT: v_mov_b32_e32 v3, s2 +; GFX906-NEXT: v_mov_b32_e32 v3, s4 +; GFX906-NEXT: v_mov_b32_e32 v4, s5 ; GFX906-NEXT: s_andn2_b64 s[0:1], s[0:1], exec -; GFX906-NEXT: s_and_b64 s[2:3], vcc, exec -; GFX906-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX906-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX906-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX906-NEXT: .LBB9_2: ; %Flow -; GFX906-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX906-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX906-NEXT: s_cbranch_execz .LBB9_4 ; GFX906-NEXT: ; %bb.3: ; %bb.2 @@ -489,12 +489,12 @@ define amdgpu_kernel void @v8i8_phi_zeroinit(ptr addrspace(1) %src1, ptr addrspa ; GFX906-NEXT: v_mov_b32_e32 v1, v3 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: v_mov_b32_e32 v2, v4 -; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[4:5] +; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[8:9] ; GFX906-NEXT: .LBB9_4: ; %bb.3 ; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[6:7] +; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[10:11] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -522,8 +522,9 @@ bb.3: define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) { ; GFX906-LABEL: v8i8_phi_const: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX906-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: ; implicit-def: $vgpr3 ; GFX906-NEXT: ; implicit-def: $vgpr13 @@ -533,8 +534,7 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: ; implicit-def: $vgpr12 ; GFX906-NEXT: ; implicit-def: $vgpr16 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[0:1] -; GFX906-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0 +; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[4:5] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_lshrrev_b32_e32 v5, 24, v2 ; GFX906-NEXT: v_lshrrev_b32_e32 v6, 16, v2 @@ -542,13 +542,13 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_lshrrev_b32_e32 v8, 24, v1 ; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v1 ; GFX906-NEXT: v_lshrrev_b32_e32 v10, 8, v1 -; GFX906-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX906-NEXT: s_cbranch_execz .LBB10_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dwordx2 v[3:4], v4, s[2:3] +; GFX906-NEXT: global_load_dwordx2 v[3:4], v4, s[6:7] ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 ; GFX906-NEXT: s_andn2_b64 s[0:1], s[0:1], exec -; GFX906-NEXT: s_and_b64 s[2:3], vcc, exec +; GFX906-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX906-NEXT: v_mov_b32_e32 v1, 1 ; GFX906-NEXT: v_mov_b32_e32 v10, 2 ; GFX906-NEXT: v_mov_b32_e32 v9, 3 @@ -557,7 +557,7 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_mov_b32_e32 v7, 6 ; GFX906-NEXT: v_mov_b32_e32 v6, 7 ; GFX906-NEXT: v_mov_b32_e32 v5, 8 -; GFX906-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX906-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_lshrrev_b32_e32 v16, 24, v4 ; GFX906-NEXT: v_lshrrev_b32_e32 v12, 16, v4 @@ -566,7 +566,7 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_lshrrev_b32_e32 v11, 16, v3 ; GFX906-NEXT: v_lshrrev_b32_e32 v13, 8, v3 ; GFX906-NEXT: .LBB10_2: ; %Flow -; GFX906-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX906-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX906-NEXT: s_cbranch_execz .LBB10_4 ; GFX906-NEXT: ; %bb.3: ; %bb.2 @@ -581,7 +581,7 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v11, v6, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: v_or_b32_sdwa v4, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[4:5] +; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[8:9] ; GFX906-NEXT: v_mov_b32_e32 v3, v1 ; GFX906-NEXT: v_mov_b32_e32 v13, v10 ; GFX906-NEXT: v_mov_b32_e32 v11, v9 @@ -603,7 +603,7 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v3, v12, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_mov_b32_e32 v2, 0 ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX906-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -631,31 +631,31 @@ bb.3: define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) { ; GFX906-LABEL: v8i8_multi_block: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX906-NEXT: v_mov_b32_e32 v5, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx2 v[3:4], v6, s[0:1] +; GFX906-NEXT: global_load_dwordx2 v[3:4], v6, s[4:5] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v1, v3 ; GFX906-NEXT: v_mov_b32_e32 v2, v4 ; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB11_4 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dwordx2 v[1:2], v6, s[2:3] +; GFX906-NEXT: global_load_dwordx2 v[1:2], v6, s[6:7] ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 ; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX906-NEXT: s_cbranch_execz .LBB11_3 ; GFX906-NEXT: ; %bb.2: ; %bb.2 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 -; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[4:5] +; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[8:9] ; GFX906-NEXT: .LBB11_3: ; %Flow ; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX906-NEXT: .LBB11_4: ; %bb.3 ; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx2 v5, v[1:2], s[6:7] +; GFX906-NEXT: global_store_dwordx2 v5, v[1:2], s[10:11] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index e3cfb5ecaf18e..c3a81771a2790 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -200,30 +200,30 @@ define amdgpu_kernel void @test_vcmp_vcnd_f16(ptr addrspace(1) %out, half %x) #0 define amdgpu_kernel void @test_vop3_cmp_f32_sop_and(ptr addrspace(1) %arg) { ; GFX1032-LABEL: test_vop3_cmp_f32_sop_and: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dword v1, v0, s[2:3] +; GFX1032-NEXT: global_load_dword v1, v0, s[4:5] ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_nge_f32_e32 vcc_lo, 0, v1 ; GFX1032-NEXT: v_cmp_nle_f32_e64 s0, 1.0, v1 ; GFX1032-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1032-NEXT: v_cndmask_b32_e64 v1, 2.0, 1.0, s0 -; GFX1032-NEXT: global_store_dword v0, v1, s[2:3] +; GFX1032-NEXT: global_store_dword v0, v1, s[4:5] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_vop3_cmp_f32_sop_and: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dword v1, v0, s[2:3] +; GFX1064-NEXT: global_load_dword v1, v0, s[4:5] ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_cmp_nge_f32_e32 vcc, 0, v1 ; GFX1064-NEXT: v_cmp_nle_f32_e64 s[0:1], 1.0, v1 ; GFX1064-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX1064-NEXT: v_cndmask_b32_e64 v1, 2.0, 1.0, s[0:1] -; GFX1064-NEXT: global_store_dword v0, v1, s[2:3] +; GFX1064-NEXT: global_store_dword v0, v1, s[4:5] ; GFX1064-NEXT: s_endpgm %lid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %lid @@ -239,30 +239,30 @@ define amdgpu_kernel void @test_vop3_cmp_f32_sop_and(ptr addrspace(1) %arg) { define amdgpu_kernel void @test_vop3_cmp_i32_sop_xor(ptr addrspace(1) %arg) { ; GFX1032-LABEL: test_vop3_cmp_i32_sop_xor: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dword v1, v0, s[2:3] +; GFX1032-NEXT: global_load_dword v1, v0, s[4:5] ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0, v1 ; GFX1032-NEXT: v_cmp_gt_i32_e64 s0, 1, v1 ; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, s0 ; GFX1032-NEXT: v_cndmask_b32_e64 v1, 2, 1, s0 -; GFX1032-NEXT: global_store_dword v0, v1, s[2:3] +; GFX1032-NEXT: global_store_dword v0, v1, s[4:5] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_vop3_cmp_i32_sop_xor: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dword v1, v0, s[2:3] +; GFX1064-NEXT: global_load_dword v1, v0, s[4:5] ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_cmp_lt_i32_e32 vcc, 0, v1 ; GFX1064-NEXT: v_cmp_gt_i32_e64 s[0:1], 1, v1 ; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] ; GFX1064-NEXT: v_cndmask_b32_e64 v1, 2, 1, s[0:1] -; GFX1064-NEXT: global_store_dword v0, v1, s[2:3] +; GFX1064-NEXT: global_store_dword v0, v1, s[4:5] ; GFX1064-NEXT: s_endpgm %lid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %lid @@ -278,30 +278,30 @@ define amdgpu_kernel void @test_vop3_cmp_i32_sop_xor(ptr addrspace(1) %arg) { define amdgpu_kernel void @test_vop3_cmp_u32_sop_or(ptr addrspace(1) %arg) { ; GFX1032-LABEL: test_vop3_cmp_u32_sop_or: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dword v1, v0, s[2:3] +; GFX1032-NEXT: global_load_dword v1, v0, s[4:5] ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_lt_u32_e32 vcc_lo, 3, v1 ; GFX1032-NEXT: v_cmp_gt_u32_e64 s0, 2, v1 ; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1032-NEXT: v_cndmask_b32_e64 v1, 2, 1, s0 -; GFX1032-NEXT: global_store_dword v0, v1, s[2:3] +; GFX1032-NEXT: global_store_dword v0, v1, s[4:5] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_vop3_cmp_u32_sop_or: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dword v1, v0, s[2:3] +; GFX1064-NEXT: global_load_dword v1, v0, s[4:5] ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_cmp_lt_u32_e32 vcc, 3, v1 ; GFX1064-NEXT: v_cmp_gt_u32_e64 s[0:1], 2, v1 ; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX1064-NEXT: v_cndmask_b32_e64 v1, 2, 1, s[0:1] -; GFX1064-NEXT: global_store_dword v0, v1, s[2:3] +; GFX1064-NEXT: global_store_dword v0, v1, s[4:5] ; GFX1064-NEXT: s_endpgm %lid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %lid @@ -631,26 +631,26 @@ bb8: define amdgpu_kernel void @test_addc_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 { ; GFX1032-LABEL: test_addc_vop2b: ; GFX1032: ; %bb.0: ; %bb -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_add_co_u32 v0, vcc_lo, v0, s2 -; GFX1032-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo -; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX1032-NEXT: v_add_co_u32 v0, vcc_lo, v0, s6 +; GFX1032-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s7, v1, vcc_lo +; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_addc_vop2b: ; GFX1064: ; %bb.0: ; %bb -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_add_co_u32 v0, vcc, v0, s2 -; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc -; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX1064-NEXT: v_add_co_u32 v0, vcc, v0, s6 +; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s7, v1, vcc +; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX1064-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -664,26 +664,26 @@ bb: define amdgpu_kernel void @test_subbrev_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 { ; GFX1032-LABEL: test_subbrev_vop2b: ; GFX1032: ; %bb.0: ; %bb -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, v0, s2 -; GFX1032-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo -; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, v0, s6 +; GFX1032-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s7, v1, vcc_lo +; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_subbrev_vop2b: ; GFX1064: ; %bb.0: ; %bb -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_sub_co_u32 v0, vcc, v0, s2 -; GFX1064-NEXT: v_subrev_co_ci_u32_e32 v1, vcc, s3, v1, vcc -; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX1064-NEXT: v_sub_co_u32 v0, vcc, v0, s6 +; GFX1064-NEXT: v_subrev_co_ci_u32_e32 v1, vcc, s7, v1, vcc +; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX1064-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -697,26 +697,26 @@ bb: define amdgpu_kernel void @test_subb_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 { ; GFX1032-LABEL: test_subb_vop2b: ; GFX1032: ; %bb.0: ; %bb -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 -; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo -; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s6, v0 +; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s7, v1, vcc_lo +; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_subb_vop2b: ; GFX1064: ; %bb.0: ; %bb -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v0 -; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc -; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s6, v0 +; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s7, v1, vcc +; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX1064-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -730,18 +730,18 @@ bb: define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1032-LABEL: test_udiv64: ; GFX1032: ; %bb.0: ; %bb -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_or_b64 s[8:9], s[6:7], s[4:5] -; GFX1032-NEXT: s_mov_b32 s8, 0 -; GFX1032-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GFX1032-NEXT: s_or_b64 s[2:3], s[6:7], s[4:5] +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1032-NEXT: s_cbranch_scc0 .LBB15_4 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GFX1032-NEXT: v_cvt_f32_u32_e32 v1, s5 -; GFX1032-NEXT: s_sub_u32 s9, 0, s4 +; GFX1032-NEXT: s_sub_u32 s3, 0, s4 ; GFX1032-NEXT: s_subb_u32 s10, 0, s5 ; GFX1032-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX1032-NEXT: v_rcp_f32_e32 v0, v0 @@ -753,11 +753,11 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1032-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1032-NEXT: v_readfirstlane_b32 s1, v0 -; GFX1032-NEXT: s_mul_i32 s11, s9, s0 -; GFX1032-NEXT: s_mul_hi_u32 s13, s9, s1 +; GFX1032-NEXT: s_mul_i32 s11, s3, s0 +; GFX1032-NEXT: s_mul_hi_u32 s13, s3, s1 ; GFX1032-NEXT: s_mul_i32 s12, s10, s1 ; GFX1032-NEXT: s_add_i32 s11, s13, s11 -; GFX1032-NEXT: s_mul_i32 s14, s9, s1 +; GFX1032-NEXT: s_mul_i32 s14, s3, s1 ; GFX1032-NEXT: s_add_i32 s11, s11, s12 ; GFX1032-NEXT: s_mul_hi_u32 s13, s1, s14 ; GFX1032-NEXT: s_mul_hi_u32 s15, s0, s14 @@ -777,46 +777,46 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032-NEXT: s_addc_u32 s0, s0, s11 ; GFX1032-NEXT: v_readfirstlane_b32 s1, v0 -; GFX1032-NEXT: s_mul_i32 s11, s9, s0 -; GFX1032-NEXT: s_mul_hi_u32 s12, s9, s1 +; GFX1032-NEXT: s_mul_i32 s11, s3, s0 +; GFX1032-NEXT: s_mul_hi_u32 s12, s3, s1 ; GFX1032-NEXT: s_mul_i32 s10, s10, s1 ; GFX1032-NEXT: s_add_i32 s11, s12, s11 -; GFX1032-NEXT: s_mul_i32 s9, s9, s1 +; GFX1032-NEXT: s_mul_i32 s3, s3, s1 ; GFX1032-NEXT: s_add_i32 s11, s11, s10 -; GFX1032-NEXT: s_mul_hi_u32 s12, s0, s9 -; GFX1032-NEXT: s_mul_i32 s13, s0, s9 -; GFX1032-NEXT: s_mul_hi_u32 s9, s1, s9 +; GFX1032-NEXT: s_mul_hi_u32 s12, s0, s3 +; GFX1032-NEXT: s_mul_i32 s13, s0, s3 +; GFX1032-NEXT: s_mul_hi_u32 s3, s1, s3 ; GFX1032-NEXT: s_mul_hi_u32 s14, s1, s11 ; GFX1032-NEXT: s_mul_i32 s1, s1, s11 ; GFX1032-NEXT: s_mul_hi_u32 s10, s0, s11 -; GFX1032-NEXT: s_add_u32 s1, s9, s1 -; GFX1032-NEXT: s_addc_u32 s9, 0, s14 +; GFX1032-NEXT: s_add_u32 s1, s3, s1 +; GFX1032-NEXT: s_addc_u32 s3, 0, s14 ; GFX1032-NEXT: s_add_u32 s1, s1, s13 ; GFX1032-NEXT: s_mul_i32 s11, s0, s11 -; GFX1032-NEXT: s_addc_u32 s1, s9, s12 -; GFX1032-NEXT: s_addc_u32 s9, s10, 0 +; GFX1032-NEXT: s_addc_u32 s1, s3, s12 +; GFX1032-NEXT: s_addc_u32 s3, s10, 0 ; GFX1032-NEXT: s_add_u32 s1, s1, s11 -; GFX1032-NEXT: s_addc_u32 s9, 0, s9 +; GFX1032-NEXT: s_addc_u32 s3, 0, s3 ; GFX1032-NEXT: v_add_co_u32 v0, s1, v0, s1 ; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1032-NEXT: s_addc_u32 s0, s0, s9 +; GFX1032-NEXT: s_addc_u32 s0, s0, s3 ; GFX1032-NEXT: v_readfirstlane_b32 s1, v0 ; GFX1032-NEXT: s_mul_i32 s10, s6, s0 -; GFX1032-NEXT: s_mul_hi_u32 s9, s6, s0 +; GFX1032-NEXT: s_mul_hi_u32 s3, s6, s0 ; GFX1032-NEXT: s_mul_hi_u32 s11, s7, s0 ; GFX1032-NEXT: s_mul_i32 s0, s7, s0 ; GFX1032-NEXT: s_mul_hi_u32 s12, s6, s1 ; GFX1032-NEXT: s_mul_hi_u32 s13, s7, s1 ; GFX1032-NEXT: s_mul_i32 s1, s7, s1 ; GFX1032-NEXT: s_add_u32 s10, s12, s10 -; GFX1032-NEXT: s_addc_u32 s9, 0, s9 +; GFX1032-NEXT: s_addc_u32 s3, 0, s3 ; GFX1032-NEXT: s_add_u32 s1, s10, s1 -; GFX1032-NEXT: s_addc_u32 s1, s9, s13 -; GFX1032-NEXT: s_addc_u32 s9, s11, 0 +; GFX1032-NEXT: s_addc_u32 s1, s3, s13 +; GFX1032-NEXT: s_addc_u32 s3, s11, 0 ; GFX1032-NEXT: s_add_u32 s1, s1, s0 -; GFX1032-NEXT: s_addc_u32 s9, 0, s9 +; GFX1032-NEXT: s_addc_u32 s3, 0, s3 ; GFX1032-NEXT: s_mul_hi_u32 s0, s4, s1 -; GFX1032-NEXT: s_mul_i32 s11, s4, s9 +; GFX1032-NEXT: s_mul_i32 s11, s4, s3 ; GFX1032-NEXT: s_mul_i32 s12, s4, s1 ; GFX1032-NEXT: s_add_i32 s0, s0, s11 ; GFX1032-NEXT: v_sub_co_u32 v0, s11, s6, s12 @@ -836,9 +836,9 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1032-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX1032-NEXT: s_add_u32 s10, s1, 1 ; GFX1032-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo -; GFX1032-NEXT: s_addc_u32 s12, s9, 0 +; GFX1032-NEXT: s_addc_u32 s12, s3, 0 ; GFX1032-NEXT: s_add_u32 s13, s1, 2 -; GFX1032-NEXT: s_addc_u32 s14, s9, 0 +; GFX1032-NEXT: s_addc_u32 s14, s3, 0 ; GFX1032-NEXT: s_cmp_lg_u32 s11, 0 ; GFX1032-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v0 ; GFX1032-NEXT: s_subb_u32 s0, s7, s0 @@ -854,9 +854,9 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1032-NEXT: v_cndmask_b32_e32 v2, s10, v2, vcc_lo ; GFX1032-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo ; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: v_cndmask_b32_e32 v1, s9, v1, vcc_lo +; GFX1032-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo ; GFX1032-NEXT: v_cndmask_b32_e32 v0, s1, v2, vcc_lo -; GFX1032-NEXT: s_andn2_b32 vcc_lo, exec_lo, s8 +; GFX1032-NEXT: s_andn2_b32 vcc_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_vccnz .LBB15_3 ; GFX1032-NEXT: .LBB15_2: ; GFX1032-NEXT: v_cvt_f32_u32_e32 v0, s4 @@ -870,21 +870,21 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1032-NEXT: s_add_i32 s0, s0, s1 ; GFX1032-NEXT: s_mul_hi_u32 s0, s6, s0 ; GFX1032-NEXT: s_mul_i32 s1, s0, s4 -; GFX1032-NEXT: s_add_i32 s5, s0, 1 +; GFX1032-NEXT: s_add_i32 s2, s0, 1 ; GFX1032-NEXT: s_sub_i32 s1, s6, s1 -; GFX1032-NEXT: s_sub_i32 s6, s1, s4 +; GFX1032-NEXT: s_sub_i32 s3, s1, s4 ; GFX1032-NEXT: s_cmp_ge_u32 s1, s4 -; GFX1032-NEXT: s_cselect_b32 s0, s5, s0 -; GFX1032-NEXT: s_cselect_b32 s1, s6, s1 -; GFX1032-NEXT: s_add_i32 s5, s0, 1 +; GFX1032-NEXT: s_cselect_b32 s0, s2, s0 +; GFX1032-NEXT: s_cselect_b32 s1, s3, s1 +; GFX1032-NEXT: s_add_i32 s2, s0, 1 ; GFX1032-NEXT: s_cmp_ge_u32 s1, s4 ; GFX1032-NEXT: s_mov_b32 s1, 0 -; GFX1032-NEXT: s_cselect_b32 s0, s5, s0 +; GFX1032-NEXT: s_cselect_b32 s0, s2, s0 ; GFX1032-NEXT: v_mov_b32_e32 v0, s0 ; GFX1032-NEXT: v_mov_b32_e32 v1, s1 ; GFX1032-NEXT: .LBB15_3: ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] offset:16 +; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] offset:16 ; GFX1032-NEXT: s_endpgm ; GFX1032-NEXT: .LBB15_4: ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -892,9 +892,9 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; ; GFX1064-LABEL: test_udiv64: ; GFX1064: ; %bb.0: ; %bb -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_or_b64 s[0:1], s[6:7], s[4:5] ; GFX1064-NEXT: s_mov_b32 s0, 0 @@ -903,7 +903,7 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GFX1064-NEXT: v_cvt_f32_u32_e32 v1, s5 -; GFX1064-NEXT: s_sub_u32 s9, 0, s4 +; GFX1064-NEXT: s_sub_u32 s3, 0, s4 ; GFX1064-NEXT: s_subb_u32 s10, 0, s5 ; GFX1064-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX1064-NEXT: v_rcp_f32_e32 v0, v0 @@ -913,92 +913,92 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1064-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GFX1064-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX1064-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX1064-NEXT: v_readfirstlane_b32 s8, v1 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1064-NEXT: s_mul_i32 s1, s9, s8 -; GFX1064-NEXT: s_mul_hi_u32 s12, s9, s0 +; GFX1064-NEXT: s_mul_i32 s1, s3, s2 +; GFX1064-NEXT: s_mul_hi_u32 s12, s3, s0 ; GFX1064-NEXT: s_mul_i32 s11, s10, s0 ; GFX1064-NEXT: s_add_i32 s1, s12, s1 -; GFX1064-NEXT: s_mul_i32 s13, s9, s0 +; GFX1064-NEXT: s_mul_i32 s13, s3, s0 ; GFX1064-NEXT: s_add_i32 s1, s1, s11 ; GFX1064-NEXT: s_mul_hi_u32 s12, s0, s13 -; GFX1064-NEXT: s_mul_hi_u32 s14, s8, s13 -; GFX1064-NEXT: s_mul_i32 s11, s8, s13 +; GFX1064-NEXT: s_mul_hi_u32 s14, s2, s13 +; GFX1064-NEXT: s_mul_i32 s11, s2, s13 ; GFX1064-NEXT: s_mul_hi_u32 s13, s0, s1 ; GFX1064-NEXT: s_mul_i32 s0, s0, s1 -; GFX1064-NEXT: s_mul_hi_u32 s15, s8, s1 +; GFX1064-NEXT: s_mul_hi_u32 s15, s2, s1 ; GFX1064-NEXT: s_add_u32 s0, s12, s0 ; GFX1064-NEXT: s_addc_u32 s12, 0, s13 ; GFX1064-NEXT: s_add_u32 s0, s0, s11 -; GFX1064-NEXT: s_mul_i32 s1, s8, s1 +; GFX1064-NEXT: s_mul_i32 s1, s2, s1 ; GFX1064-NEXT: s_addc_u32 s0, s12, s14 ; GFX1064-NEXT: s_addc_u32 s11, s15, 0 ; GFX1064-NEXT: s_add_u32 s0, s0, s1 ; GFX1064-NEXT: s_addc_u32 s11, 0, s11 ; GFX1064-NEXT: v_add_co_u32 v0, s[0:1], v0, s0 ; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: s_addc_u32 s8, s8, s11 +; GFX1064-NEXT: s_addc_u32 s2, s2, s11 ; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1064-NEXT: s_mul_i32 s1, s9, s8 -; GFX1064-NEXT: s_mul_hi_u32 s11, s9, s0 +; GFX1064-NEXT: s_mul_i32 s1, s3, s2 +; GFX1064-NEXT: s_mul_hi_u32 s11, s3, s0 ; GFX1064-NEXT: s_mul_i32 s10, s10, s0 ; GFX1064-NEXT: s_add_i32 s1, s11, s1 -; GFX1064-NEXT: s_mul_i32 s9, s9, s0 +; GFX1064-NEXT: s_mul_i32 s3, s3, s0 ; GFX1064-NEXT: s_add_i32 s1, s1, s10 -; GFX1064-NEXT: s_mul_hi_u32 s11, s8, s9 -; GFX1064-NEXT: s_mul_i32 s12, s8, s9 -; GFX1064-NEXT: s_mul_hi_u32 s9, s0, s9 +; GFX1064-NEXT: s_mul_hi_u32 s11, s2, s3 +; GFX1064-NEXT: s_mul_i32 s12, s2, s3 +; GFX1064-NEXT: s_mul_hi_u32 s3, s0, s3 ; GFX1064-NEXT: s_mul_hi_u32 s13, s0, s1 ; GFX1064-NEXT: s_mul_i32 s0, s0, s1 -; GFX1064-NEXT: s_mul_hi_u32 s10, s8, s1 -; GFX1064-NEXT: s_add_u32 s0, s9, s0 -; GFX1064-NEXT: s_addc_u32 s9, 0, s13 +; GFX1064-NEXT: s_mul_hi_u32 s10, s2, s1 +; GFX1064-NEXT: s_add_u32 s0, s3, s0 +; GFX1064-NEXT: s_addc_u32 s3, 0, s13 ; GFX1064-NEXT: s_add_u32 s0, s0, s12 -; GFX1064-NEXT: s_mul_i32 s1, s8, s1 -; GFX1064-NEXT: s_addc_u32 s0, s9, s11 -; GFX1064-NEXT: s_addc_u32 s9, s10, 0 +; GFX1064-NEXT: s_mul_i32 s1, s2, s1 +; GFX1064-NEXT: s_addc_u32 s0, s3, s11 +; GFX1064-NEXT: s_addc_u32 s3, s10, 0 ; GFX1064-NEXT: s_add_u32 s0, s0, s1 -; GFX1064-NEXT: s_addc_u32 s9, 0, s9 +; GFX1064-NEXT: s_addc_u32 s3, 0, s3 ; GFX1064-NEXT: v_add_co_u32 v0, s[0:1], v0, s0 ; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: s_addc_u32 s0, s8, s9 +; GFX1064-NEXT: s_addc_u32 s0, s2, s3 ; GFX1064-NEXT: v_readfirstlane_b32 s1, v0 -; GFX1064-NEXT: s_mul_i32 s9, s6, s0 -; GFX1064-NEXT: s_mul_hi_u32 s8, s6, s0 +; GFX1064-NEXT: s_mul_i32 s3, s6, s0 +; GFX1064-NEXT: s_mul_hi_u32 s2, s6, s0 ; GFX1064-NEXT: s_mul_hi_u32 s10, s7, s0 ; GFX1064-NEXT: s_mul_i32 s0, s7, s0 ; GFX1064-NEXT: s_mul_hi_u32 s11, s6, s1 ; GFX1064-NEXT: s_mul_hi_u32 s12, s7, s1 ; GFX1064-NEXT: s_mul_i32 s1, s7, s1 -; GFX1064-NEXT: s_add_u32 s9, s11, s9 -; GFX1064-NEXT: s_addc_u32 s8, 0, s8 -; GFX1064-NEXT: s_add_u32 s1, s9, s1 -; GFX1064-NEXT: s_addc_u32 s1, s8, s12 -; GFX1064-NEXT: s_addc_u32 s8, s10, 0 +; GFX1064-NEXT: s_add_u32 s3, s11, s3 +; GFX1064-NEXT: s_addc_u32 s2, 0, s2 +; GFX1064-NEXT: s_add_u32 s1, s3, s1 +; GFX1064-NEXT: s_addc_u32 s1, s2, s12 +; GFX1064-NEXT: s_addc_u32 s2, s10, 0 ; GFX1064-NEXT: s_add_u32 s10, s1, s0 -; GFX1064-NEXT: s_addc_u32 s11, 0, s8 +; GFX1064-NEXT: s_addc_u32 s11, 0, s2 ; GFX1064-NEXT: s_mul_hi_u32 s0, s4, s10 ; GFX1064-NEXT: s_mul_i32 s1, s4, s11 -; GFX1064-NEXT: s_mul_i32 s9, s4, s10 +; GFX1064-NEXT: s_mul_i32 s3, s4, s10 ; GFX1064-NEXT: s_add_i32 s12, s0, s1 -; GFX1064-NEXT: v_sub_co_u32 v0, s[0:1], s6, s9 -; GFX1064-NEXT: s_mul_i32 s8, s5, s10 -; GFX1064-NEXT: s_add_i32 s12, s12, s8 -; GFX1064-NEXT: v_sub_co_u32 v1, s[8:9], v0, s4 +; GFX1064-NEXT: v_sub_co_u32 v0, s[0:1], s6, s3 +; GFX1064-NEXT: s_mul_i32 s2, s5, s10 +; GFX1064-NEXT: s_add_i32 s12, s12, s2 +; GFX1064-NEXT: v_sub_co_u32 v1, s[2:3], v0, s4 ; GFX1064-NEXT: s_sub_i32 s13, s7, s12 ; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_subb_u32 s13, s13, s5 -; GFX1064-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 -; GFX1064-NEXT: s_subb_u32 s8, s13, 0 -; GFX1064-NEXT: s_cmp_ge_u32 s8, s5 +; GFX1064-NEXT: s_subb_u32 s2, s13, 0 +; GFX1064-NEXT: s_cmp_ge_u32 s2, s5 ; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; GFX1064-NEXT: s_cselect_b32 s9, -1, 0 -; GFX1064-NEXT: s_cmp_eq_u32 s8, s5 +; GFX1064-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1064-NEXT: s_cmp_eq_u32 s2, s5 ; GFX1064-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX1064-NEXT: s_add_u32 s8, s10, 1 -; GFX1064-NEXT: v_cndmask_b32_e32 v1, s9, v1, vcc -; GFX1064-NEXT: s_addc_u32 s9, s11, 0 +; GFX1064-NEXT: s_add_u32 s2, s10, 1 +; GFX1064-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc +; GFX1064-NEXT: s_addc_u32 s3, s11, 0 ; GFX1064-NEXT: s_add_u32 s13, s10, 2 ; GFX1064-NEXT: s_addc_u32 s14, s11, 0 ; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 @@ -1013,8 +1013,8 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1064-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX1064-NEXT: v_mov_b32_e32 v1, s14 ; GFX1064-NEXT: v_cndmask_b32_e64 v0, s7, v0, s[0:1] -; GFX1064-NEXT: v_cndmask_b32_e32 v2, s8, v2, vcc -; GFX1064-NEXT: v_cndmask_b32_e32 v1, s9, v1, vcc +; GFX1064-NEXT: v_cndmask_b32_e32 v2, s2, v2, vcc +; GFX1064-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc ; GFX1064-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: v_cndmask_b32_e32 v1, s11, v1, vcc ; GFX1064-NEXT: v_cndmask_b32_e32 v0, s10, v2, vcc @@ -1031,21 +1031,21 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1064-NEXT: s_add_i32 s0, s0, s1 ; GFX1064-NEXT: s_mul_hi_u32 s0, s6, s0 ; GFX1064-NEXT: s_mul_i32 s1, s0, s4 -; GFX1064-NEXT: s_add_i32 s5, s0, 1 +; GFX1064-NEXT: s_add_i32 s2, s0, 1 ; GFX1064-NEXT: s_sub_i32 s1, s6, s1 -; GFX1064-NEXT: s_sub_i32 s6, s1, s4 +; GFX1064-NEXT: s_sub_i32 s3, s1, s4 ; GFX1064-NEXT: s_cmp_ge_u32 s1, s4 -; GFX1064-NEXT: s_cselect_b32 s0, s5, s0 -; GFX1064-NEXT: s_cselect_b32 s1, s6, s1 -; GFX1064-NEXT: s_add_i32 s5, s0, 1 +; GFX1064-NEXT: s_cselect_b32 s0, s2, s0 +; GFX1064-NEXT: s_cselect_b32 s1, s3, s1 +; GFX1064-NEXT: s_add_i32 s2, s0, 1 ; GFX1064-NEXT: s_cmp_ge_u32 s1, s4 ; GFX1064-NEXT: s_mov_b32 s1, 0 -; GFX1064-NEXT: s_cselect_b32 s0, s5, s0 +; GFX1064-NEXT: s_cselect_b32 s0, s2, s0 ; GFX1064-NEXT: v_mov_b32_e32 v0, s0 ; GFX1064-NEXT: v_mov_b32_e32 v1, s1 ; GFX1064-NEXT: .LBB15_3: ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] offset:16 +; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] offset:16 ; GFX1064-NEXT: s_endpgm ; GFX1064-NEXT: .LBB15_4: ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -1063,30 +1063,30 @@ bb: define amdgpu_kernel void @test_div_scale_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX1032-LABEL: test_div_scale_f32: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX1032-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX1032-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_div_scale_f32 v1, s2, v2, v2, v1 -; GFX1032-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032-NEXT: v_div_scale_f32 v1, s0, v2, v2, v1 +; GFX1032-NEXT: global_store_dword v0, v1, s[4:5] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_div_scale_f32: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX1064-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX1064-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_div_scale_f32 v1, s[2:3], v2, v2, v1 -; GFX1064-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064-NEXT: v_div_scale_f32 v1, s[0:1], v2, v2, v1 +; GFX1064-NEXT: global_store_dword v0, v1, s[4:5] ; GFX1064-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid @@ -1263,9 +1263,10 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %dummy) #0 { ; GFX1032-LABEL: test_div_fmas_f32_i1_phi_vcc: ; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: s_clause 0x1 ; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 ; GFX1032-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 ; GFX1032-NEXT: s_mov_b32 vcc_lo, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -1274,7 +1275,7 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, p ; GFX1032-NEXT: s_cbranch_execz .LBB22_2 ; GFX1032-NEXT: ; %bb.1: ; %bb ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: global_load_dword v0, v0, s[2:3] glc dlc +; GFX1032-NEXT: global_load_dword v0, v0, s[8:9] glc dlc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_b32 vcc_lo, vcc_lo, exec_lo @@ -1703,30 +1704,30 @@ define amdgpu_kernel void @test_set_inactive(ptr addrspace(1) %out, i32 %in) #0 define amdgpu_kernel void @test_set_inactive_64(ptr addrspace(1) %out, i64 %in) #0 { ; GFX1032-LABEL: test_set_inactive_64: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v0, s2 -; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: v_mov_b32_e32 v0, s6 +; GFX1032-NEXT: v_mov_b32_e32 v1, s7 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_set_inactive_64: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v0, s2 -; GFX1064-NEXT: v_mov_b32_e32 v1, s3 +; GFX1064-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064-NEXT: v_mov_b32_e32 v1, s7 ; GFX1064-NEXT: s_not_b64 exec, exec ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX1064-NEXT: s_endpgm %tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) store i64 %tmp, ptr addrspace(1) %out @@ -2353,42 +2354,42 @@ define amdgpu_ps float @test_ps_live() #0 { define amdgpu_kernel void @test_vccnz_ifcvt_triangle64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX1032-LABEL: test_vccnz_ifcvt_triangle64: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_cmp_neq_f64_e64 s4, s[2:3], 1.0 -; GFX1032-NEXT: s_and_b32 vcc_lo, exec_lo, s4 +; GFX1032-NEXT: v_cmp_neq_f64_e64 s2, s[0:1], 1.0 +; GFX1032-NEXT: s_and_b32 vcc_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_vccnz .LBB47_2 ; GFX1032-NEXT: ; %bb.1: ; %if -; GFX1032-NEXT: v_add_f64 v[0:1], s[2:3], s[2:3] +; GFX1032-NEXT: v_add_f64 v[0:1], s[0:1], s[0:1] ; GFX1032-NEXT: s_branch .LBB47_3 ; GFX1032-NEXT: .LBB47_2: -; GFX1032-NEXT: v_mov_b32_e32 v0, s2 -; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: v_mov_b32_e32 v0, s0 +; GFX1032-NEXT: v_mov_b32_e32 v1, s1 ; GFX1032-NEXT: .LBB47_3: ; %endif ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_vccnz_ifcvt_triangle64: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_cmp_neq_f64_e64 s[4:5], s[2:3], 1.0 -; GFX1064-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX1064-NEXT: v_cmp_neq_f64_e64 s[2:3], s[0:1], 1.0 +; GFX1064-NEXT: s_and_b64 vcc, exec, s[2:3] ; GFX1064-NEXT: s_cbranch_vccnz .LBB47_2 ; GFX1064-NEXT: ; %bb.1: ; %if -; GFX1064-NEXT: v_add_f64 v[0:1], s[2:3], s[2:3] +; GFX1064-NEXT: v_add_f64 v[0:1], s[0:1], s[0:1] ; GFX1064-NEXT: s_branch .LBB47_3 ; GFX1064-NEXT: .LBB47_2: -; GFX1064-NEXT: v_mov_b32_e32 v0, s2 -; GFX1064-NEXT: v_mov_b32_e32 v1, s3 +; GFX1064-NEXT: v_mov_b32_e32 v0, s0 +; GFX1064-NEXT: v_mov_b32_e32 v1, s1 ; GFX1064-NEXT: .LBB47_3: ; %endif ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX1064-NEXT: s_endpgm entry: %v = load double, ptr addrspace(1) %in diff --git a/llvm/test/CodeGen/AMDGPU/zero_extend.ll b/llvm/test/CodeGen/AMDGPU/zero_extend.ll index f9137b075e462..af50e09f509a3 100644 --- a/llvm/test/CodeGen/AMDGPU/zero_extend.ll +++ b/llvm/test/CodeGen/AMDGPU/zero_extend.ll @@ -53,7 +53,7 @@ define amdgpu_kernel void @s_cmp_zext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i ; GCN-DAG: s_and_b32 [[MASK_A:s[0-9]+]], [[A]], 0xffff{{$}} ; GCN-DAG: s_and_b32 [[MASK_B:s[0-9]+]], [[B]], 0xffff{{$}} -; GCN: s_cmp_eq_u32 [[MASK_A]], [[B]] +; GCN: s_cmp_eq_u32 [[MASK_A]], [[MASK_B]] ; GCN: s_cselect_b64 [[CC:s\[[0-9:]+\]]], -1, 0 ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]] ; GCN: buffer_store_short [[RESULT]]