-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[AMDGPU] Use larger immediate values in S_NOP #158990
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
The S_NOP instruction has an immediate operand which is one less than the number of cycles to delay for. The maximum value that may be encoded in this field was increased in GFX8 and again in GFX12.
|
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu Author: Jay Foad (jayfoad) ChangesThe S_NOP instruction has an immediate operand which is one less than Patch is 593.88 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/158990.diff 32 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 7b94ea3ffbf1f..771e13d37e5e2 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -688,6 +688,14 @@ unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const {
return NSAThreshold;
}
+unsigned GCNSubtarget::getSNopBits() const {
+ if (getGeneration() >= AMDGPUSubtarget::GFX12)
+ return 7;
+ if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ return 4;
+ return 3;
+}
+
GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
const GCNSubtarget &ST)
: ST(ST) {
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index cbd6f64976d21..00339db9d990e 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1839,6 +1839,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
/// \returns true if the subtarget requires a wait for xcnt before atomic
/// flat/global stores & rmw.
bool requiresWaitXCntBeforeAtomicStores() const { return GFX1250Insts; }
+
+ /// \returns the number of significant bits in the immediate field of the
+ /// S_NOP instruction.
+ unsigned getSNopBits() const;
};
class GCNUserSGPRUsageInfo {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 5106478a95b43..ee3e8dc58b468 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1932,8 +1932,9 @@ void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
unsigned Quantity) const {
DebugLoc DL = MBB.findDebugLoc(MI);
+ unsigned MaxSNopCount = 1u << ST.getSNopBits();
while (Quantity > 0) {
- unsigned Arg = std::min(Quantity, 8u);
+ unsigned Arg = std::min(Quantity, MaxSNopCount);
Quantity -= Arg;
BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
index 393a462954003..5720b882f4e73 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
@@ -58,8 +58,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3
; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: s_nop 7
+; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 1
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35]
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
@@ -109,8 +108,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: s_nop 1
+; GCN-NEXT: s_nop 9
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
@@ -185,8 +183,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) #
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: s_nop 7
+; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 1
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
@@ -220,8 +217,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg)
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: s_nop 1
+; GCN-NEXT: s_nop 9
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GCN-NEXT: s_endpgm
bb:
@@ -277,8 +273,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: s_nop 7
+; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 0
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9]
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[8:9] offset:16
@@ -302,8 +297,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm(ptr addrspace(1) %
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 0
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: s_nop 7
+; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 0
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
@@ -336,8 +330,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: s_nop 7
+; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 0
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
@@ -369,8 +362,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: s_nop 7
+; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 0
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
diff --git a/llvm/test/CodeGen/AMDGPU/acc-ldst.ll b/llvm/test/CodeGen/AMDGPU/acc-ldst.ll
index 002ccd6060681..635d2a2d16a76 100644
--- a/llvm/test/CodeGen/AMDGPU/acc-ldst.ll
+++ b/llvm/test/CodeGen/AMDGPU/acc-ldst.ll
@@ -9,8 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x()
; GCN-COUNT-8: global_load_dwordx4 a[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
; GCN-NOT: v_accvgpr_write
; GCN: v_mfma_f32_32x32x1f32
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: s_nop 7
+; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 2
; GCN-NOT: v_accvgpr_read
; GCN-COUNT-8: global_store_dwordx4 v{{[0-9:]+}}, a[{{[0-9:]+}}], s[{{[0-9:]+}}]
@@ -28,8 +27,7 @@ bb:
; GCN: global_load_dword a{{[0-9]+}}, v{{[0-9:]+}}, s[{{[0-9:]+}}]
; GCN-NOT: v_accvgpr_read
; GCN: v_mfma_f32_32x32x1f32 a[[[N:[0-9]+]]:
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: s_nop 7
+; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 2
; GCN-NOT: v_accvgpr_read
; GCN-NEXT: global_store_dword v{{[0-9:]+}}, a[[N]], s[{{[0-9:]+}}]
@@ -80,8 +78,7 @@ bb:
; GCN-COUNT-8: global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
; GCN-COUNT-32: v_accvgpr_write
; GCN: v_mfma_f32_32x32x1f32
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: s_nop 7
+; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 2
; GCN-NOT: v_accvgpr_read
; GCN-COUNT-8: global_store_dwordx4 v{{[0-9:]+}}, a[{{[0-9:]+}}]
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
index c226dae3d64a9..9e240238c1066 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
@@ -63,8 +63,7 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 {
; GFX908-NEXT: v_accvgpr_write_b32 a16, v39
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31]
-; GFX908-NEXT: s_nop 7
-; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: s_nop 9
; GFX908-NEXT: v_accvgpr_read_b32 v39, a0 ; Reload Reuse
; GFX908-NEXT: v_accvgpr_read_b32 v38, a11 ; Reload Reuse
; GFX908-NEXT: v_accvgpr_read_b32 v37, a12 ; Reload Reuse
@@ -181,8 +180,7 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 {
; GFX90A-NEXT: v_accvgpr_mov_b32 a16, a0
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31]
-; GFX90A-NEXT: s_nop 7
-; GFX90A-NEXT: s_nop 2
+; GFX90A-NEXT: s_nop 10
; GFX90A-NEXT: buffer_store_dword a0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_store_dword a1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
@@ -487,8 +485,7 @@ define void @v32_asm_def_use(float %v0, float %v1) #4 {
; GFX90A-NEXT: ; copy
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_write_b32 a32, v35 ; Reload Reuse
-; GFX90A-NEXT: s_nop 7
-; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: s_nop 9
; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a3 v[0:31]
@@ -965,8 +962,7 @@ define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 {
; GFX908-NEXT: v_accvgpr_write_b32 a16, v39
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31]
-; GFX908-NEXT: s_nop 7
-; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: s_nop 9
; GFX908-NEXT: v_accvgpr_read_b32 v39, a0 ; Reload Reuse
; GFX908-NEXT: v_accvgpr_read_b32 v38, a11 ; Reload Reuse
; GFX908-NEXT: v_accvgpr_read_b32 v37, a12 ; Reload Reuse
@@ -1084,8 +1080,7 @@ define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 {
; GFX90A-NEXT: v_accvgpr_read_b32 v34, a32 ; Reload Reuse
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31]
-; GFX90A-NEXT: s_nop 7
-; GFX90A-NEXT: s_nop 2
+; GFX90A-NEXT: s_nop 10
; GFX90A-NEXT: buffer_store_dword a0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_store_dword a1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-snop-padding.mir b/llvm/test/CodeGen/AMDGPU/amdgpu-snop-padding.mir
index 22c913496b734..b5c3e3214f125 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-snop-padding.mir
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-snop-padding.mir
@@ -63,52 +63,41 @@ body: |
; GCN16-NEXT: successors: %bb.1(0x80000000)
; GCN16-NEXT: liveins: $sgpr6, $sgpr10_sgpr11
; GCN16-NEXT: {{ $}}
- ; GCN16-NEXT: S_NOP 7
- ; GCN16-NEXT: S_NOP 7
+ ; GCN16-NEXT: S_NOP 15
; GCN16-NEXT: S_BRANCH %bb.1
; GCN16-NEXT: {{ $}}
; GCN16-NEXT: bb.1:
; GCN16-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
; GCN16-NEXT: liveins: $sgpr6, $sgpr10_sgpr11
; GCN16-NEXT: {{ $}}
- ; GCN16-NEXT: S_NOP 7
- ; GCN16-NEXT: S_NOP 7
+ ; GCN16-NEXT: S_NOP 15
; GCN16-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 10, implicit $exec
- ; GCN16-NEXT: S_NOP 7
- ; GCN16-NEXT: S_NOP 7
+ ; GCN16-NEXT: S_NOP 15
; GCN16-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec
; GCN16-NEXT: {{ $}}
; GCN16-NEXT: bb.2:
; GCN16-NEXT: successors: %bb.3(0x80000000)
; GCN16-NEXT: liveins: $sgpr6, $sgpr10_sgpr11
; GCN16-NEXT: {{ $}}
- ; GCN16-NEXT: S_NOP 7
- ; GCN16-NEXT: S_NOP 7
+ ; GCN16-NEXT: S_NOP 15
; GCN16-NEXT: SI_SPILL_S32_SAVE killed $sgpr6, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
- ; GCN16-NEXT: S_NOP 7
- ; GCN16-NEXT: S_NOP 7
+ ; GCN16-NEXT: S_NOP 15
; GCN16-NEXT: S_NOP 0
- ; GCN16-NEXT: S_NOP 7
- ; GCN16-NEXT: S_NOP 7
+ ; GCN16-NEXT: S_NOP 15
; GCN16-NEXT: renamable $sgpr6 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
- ; GCN16-NEXT: S_NOP 7
- ; GCN16-NEXT: S_NOP 7
+ ; GCN16-NEXT: S_NOP 15
; GCN16-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 20, implicit $exec
- ; GCN16-NEXT: S_NOP 7
- ; GCN16-NEXT: S_NOP 7
+ ; GCN16-NEXT: S_NOP 15
; GCN16-NEXT: S_BRANCH %bb.3
; GCN16-NEXT: {{ $}}
; GCN16-NEXT: bb.3:
; GCN16-NEXT: liveins: $sgpr10_sgpr11
; GCN16-NEXT: {{ $}}
- ; GCN16-NEXT: S_NOP 7
- ; GCN16-NEXT: S_NOP 7
+ ; GCN16-NEXT: S_NOP 15
; GCN16-NEXT: $sgpr5 = V_READFIRSTLANE_B32 [[V_MOV_B32_e32_]], implicit $exec
- ; GCN16-NEXT: S_NOP 7
- ; GCN16-NEXT: S_NOP 7
+ ; GCN16-NEXT: S_NOP 15
; GCN16-NEXT: S_STORE_DWORD_IMM $sgpr5, $sgpr10_sgpr11, 0, 0
- ; GCN16-NEXT: S_NOP 7
- ; GCN16-NEXT: S_NOP 7
+ ; GCN16-NEXT: S_NOP 15
; GCN16-NEXT: SI_RETURN
bb.0:
liveins: $sgpr6, $sgpr10_sgpr11
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll
index 303ea50dc16cc..12a998ad82cd2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll
@@ -87,8 +87,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(ptr addrspace(1) %arg) #0 {
; GFX908-NEXT: v_mov_b32_e32 v0, 2
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_mfma_f32_32x32x2bf16 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3
-; GFX908-NEXT: s_nop 7
-; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 15
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
@@ -191,8 +190,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: v_mfma_f32_32x32x2bf16 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3
-; GFX90A-NEXT: s_nop 7
-; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 2
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
@@ -256,8 +254,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(ptr addrspace(1) %arg) #0 {
; GFX908-NEXT: v_mov_b32_e32 v1, 2
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
-; GFX908-NEXT: s_nop 7
-; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: s_nop 9
; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
@@ -308,8 +305,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
-; GFX90A-NEXT: s_nop 7
-; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: s_nop 9
; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
@@ -424,8 +420,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(ptr addrspace(1) %arg) #0 {
; GFX908-NEXT: v_mov_b32_e32 v1, 2
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
-; GFX908-NEXT: s_nop 7
-; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 15
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
@@ -476,8 +471,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
-; GFX90A-NEXT: s_nop 7
-; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
@@ -513,8 +507,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8bf16(ptr addrspace(1) %arg) #0 {
; GFX908-NEXT: v_accvgpr_write_b32 a3, v5
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_mfma_f32_16x16x8bf16 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3
-; GFX908-NEXT: s_nop 7
-; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: s_nop 9
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
@@ -538,8 +531,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8bf16(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: v_mfma_f32_16x16x8bf16 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3
-; GFX90A-NEXT: s_nop 7
-; GFX90A-NEXT: s_nop 2
+; GFX90A-NEXT: s_nop 10
; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
; GFX90A-NEXT: s_endpgm
bb:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
index ff77d5ccbe312..5ab8706f28f5f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
@@ -59,8 +59,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #
; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: v_mfma_f32_32x32x4bf16_1k a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3
-; GFX90A-NEXT: s_nop 7
-; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 2
; GFX90A-NEXT: global_store_dwordx4 v1, a[24:27], s[34:35] offset:96
; GFX90A-NEXT: global_store_dwordx4 v1, a[28:31], s[34:35] offset:112
@@ -117,8 +116,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #
; GFX942-NEXT: v_accvgpr_write_b32 a31, s15
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_mfma_f32_32x32x4_2b_bf16 a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3
-; GFX942-NEXT: s_nop 7
-; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 15
; GFX942-NEXT: s_nop 2
; GFX942-NEXT: global_store_dwordx4 v1, a[24:27], s[34:35] offset:96
; GFX942-NEXT: global_store_dwordx4 v1, a[28:31], s[34:35] offset:112
@@ -175,8 +173,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v31, s15
; GFX90A-VGPR-NEXT: s_nop 1
; GFX90A-VGPR-NEXT: v_mfma_f32_32x32x4bf16_1k v[0:31], v[34:35], v[32:33], v[0:31] cbsz:1 abid:2 blgp:3
-; GFX90A-VGPR-NEXT: s_nop 7
-; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 15
; GFX90A-VGPR-NEXT: s_nop 2
; GFX90A-VGPR-NEXT: global_store_dwordx4 v33, v[24:27], s[34:35] offset:96
; GFX90A-VGPR-NEXT: global_store_dwordx4 v33, v[28:31], s[34:35] offset:112
@@ -233,8 +230,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #
; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, s15
; GFX942-VGPR-NEXT: s_nop 1
; GFX942-VGPR-NEXT: v_mfma_f32_32x32x4_2b_bf16 v[0:31], v[34:35], v[32:33], v[0:31] cbsz:1 abid:2 blgp:3
-; GFX942-VGPR-NEXT: s_nop 7
-; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 15
; GFX942-VGPR-NEXT: s_nop 2
; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[24:27], s[34:35] offset:96
; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[28:31], s[34:35] offset:112
@@ -283,8 +279,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #
; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: v_mfma_f32_16x16x4bf16_1k a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3
-; GFX90A-NEXT: s_nop 7
-; GFX90A-NEXT: s_nop 2
+; GFX90A-NEXT: s_nop 10
; GFX90A-NEXT: global_store_dwordx4 v1, a[12:15], s[16:17] offset:48
; GFX90A-NEXT: global_store_dwordx4 v1, a[8:11], s[16:17] offset:32
; GFX90A-NEXT: global_store_dwordx4 v1, a[4:7], s[16:17] offset:16
@@ -319,8 +314,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #
; GFX942-NEXT: v_accvgpr_write_b32 a15, s15
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_mfma_f32_16x16x4_4b_bf16 a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-NEXT: s_nop 7
-; GFX942-NEXT: s_nop 2
+; GFX942-NEXT: ...
[truncated]
|
|
The first commit just contains some auto-check-regerenation. If you're skimming the test diffs you may prefer to look at the second commit only. |
arsenm
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Probably should include a machine verifier check the correct number of bits are used
| unsigned GCNSubtarget::getSNopBits() const { | ||
| if (getGeneration() >= AMDGPUSubtarget::GFX12) | ||
| return 7; | ||
| if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) | ||
| return 4; | ||
| return 3; | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Pull definition into header?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done
That would break I guess that should either be disallowed, or it should generate multiple s_nop instructions, but I didn't want to get into changing the behaviour of an intrinsic in this patch. |
The S_NOP instruction has an immediate operand which is one less than
the number of cycles to delay for. The maximum value that may be encoded
in this field was increased in GFX8 and again in GFX12.