Skip to content

Commit a6ee0ad

Browse files
[llvm-mca][AMDGPU] Retire instructions that have issue carry over correctly (#83881)
Issue #83775 shows that llvm-mca hits a sanitizer error in cycleEnd. The failing case involved an instruction that takes multiple cycles to issue and has finished executing directly after issue. Prior to this patch, such an instruction was retired on the first issue cycle, despite taking multiple cycles to issue. To fix this, if an instruction takes multiple cycles to issue and is done executing after issue, updateCarriedOver is made responsible for retiring the instruction once it is fully issued.
1 parent df267fe commit a6ee0ad

File tree

2 files changed

+113
-10
lines changed

2 files changed

+113
-10
lines changed

llvm/lib/MCA/Stages/InOrderIssueStage.cpp

Lines changed: 26 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -257,13 +257,13 @@ llvm::Error InOrderIssueStage::tryIssue(InstRef &IR) {
257257
}
258258

259259
// If the instruction has a latency of 0, we need to handle
260-
// the execution and retirement now.
261-
if (IS.isExecuted()) {
260+
// the execution and retirement now. If the instruction is issued in multiple
261+
// cycles, we cannot handle the instruction being executed here so we make
262+
// updateCarriedOver responsible.
263+
if (IS.isExecuted() && !ShouldCarryOver) {
262264
PRF.onInstructionExecuted(&IS);
263265
LSU.onInstructionExecuted(IR);
264-
notifyEvent<HWInstructionEvent>(
265-
HWInstructionEvent(HWInstructionEvent::Executed, IR));
266-
LLVM_DEBUG(dbgs() << "[E] Instruction #" << IR << " is executed\n");
266+
notifyInstructionExecuted(IR);
267267

268268
retireInstruction(IR);
269269
return llvm::ErrorSuccess();
@@ -294,12 +294,18 @@ void InOrderIssueStage::updateIssuedInst() {
294294
continue;
295295
}
296296

297-
PRF.onInstructionExecuted(&IS);
298-
LSU.onInstructionExecuted(IR);
299-
notifyInstructionExecuted(IR);
300-
++NumExecuted;
297+
// If the instruction takes multiple cycles to issue, defer these calls
298+
// to updateCarriedOver. We still remove from IssuedInst even if there is
299+
// carry over to avoid an extra call to cycleEvent in the next cycle.
300+
if (!CarriedOver) {
301+
PRF.onInstructionExecuted(&IS);
302+
LSU.onInstructionExecuted(IR);
303+
notifyInstructionExecuted(IR);
301304

302-
retireInstruction(*I);
305+
retireInstruction(*I);
306+
}
307+
308+
++NumExecuted;
303309

304310
std::iter_swap(I, E - NumExecuted);
305311
}
@@ -329,6 +335,16 @@ void InOrderIssueStage::updateCarriedOver() {
329335
else
330336
Bandwidth -= CarryOver;
331337

338+
// updateIssuedInst deferred these calls to updateCarriedOver when there was
339+
// a carry over.
340+
if (CarriedOver.getInstruction()->isExecuted()) {
341+
PRF.onInstructionExecuted(CarriedOver.getInstruction());
342+
LSU.onInstructionExecuted(CarriedOver);
343+
notifyInstructionExecuted(CarriedOver);
344+
345+
retireInstruction(CarriedOver);
346+
}
347+
332348
CarriedOver = InstRef();
333349
CarryOver = 0;
334350
}
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
2+
# RUN: llvm-mca -mtriple=amdgcn -mcpu=gfx940 --timeline --iterations=1 --timeline-max-cycles=0 < %s | FileCheck %s
3+
4+
v_pk_mov_b32 v[0:1], v[2:3], v[4:5]
5+
v_pk_add_f32 v[0:1], v[0:1], v[0:1]
6+
v_pk_mul_f32 v[0:1], v[0:1], v[0:1]
7+
v_add_co_u32 v5, s[0:1], v1, v2
8+
v_sub_co_u32 v5, s[0:1], v1, v2
9+
v_add_u32 v5, v1, v2
10+
v_sub_u32 v5, v1, v2
11+
12+
# CHECK: Iterations: 1
13+
# CHECK-NEXT: Instructions: 7
14+
# CHECK-NEXT: Total Cycles: 10
15+
# CHECK-NEXT: Total uOps: 9
16+
17+
# CHECK: Dispatch Width: 1
18+
# CHECK-NEXT: uOps Per Cycle: 0.90
19+
# CHECK-NEXT: IPC: 0.70
20+
# CHECK-NEXT: Block RThroughput: 9.0
21+
22+
# CHECK: Instruction Info:
23+
# CHECK-NEXT: [1]: #uOps
24+
# CHECK-NEXT: [2]: Latency
25+
# CHECK-NEXT: [3]: RThroughput
26+
# CHECK-NEXT: [4]: MayLoad
27+
# CHECK-NEXT: [5]: MayStore
28+
# CHECK-NEXT: [6]: HasSideEffects (U)
29+
30+
# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
31+
# CHECK-NEXT: 1 1 1.00 U v_pk_mov_b32 v[0:1], v[2:3], v[4:5]
32+
# CHECK-NEXT: 1 1 1.00 U v_pk_add_f32 v[0:1], v[0:1], v[0:1]
33+
# CHECK-NEXT: 1 1 1.00 U v_pk_mul_f32 v[0:1], v[0:1], v[0:1]
34+
# CHECK-NEXT: 2 1 1.00 U v_add_co_u32_e64 v5, s[0:1], v1, v2
35+
# CHECK-NEXT: 2 1 1.00 U v_sub_co_u32_e64 v5, s[0:1], v1, v2
36+
# CHECK-NEXT: 1 1 1.00 U v_add_u32_e32 v5, v1, v2
37+
# CHECK-NEXT: 1 1 1.00 U v_sub_u32_e32 v5, v1, v2
38+
39+
# CHECK: Resources:
40+
# CHECK-NEXT: [0] - HWBranch
41+
# CHECK-NEXT: [1] - HWExport
42+
# CHECK-NEXT: [2] - HWLGKM
43+
# CHECK-NEXT: [3] - HWSALU
44+
# CHECK-NEXT: [4] - HWVALU
45+
# CHECK-NEXT: [5] - HWVMEM
46+
# CHECK-NEXT: [6] - HWXDL
47+
48+
# CHECK: Resource pressure per iteration:
49+
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6]
50+
# CHECK-NEXT: - - - 2.00 7.00 - -
51+
52+
# CHECK: Resource pressure by instruction:
53+
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] Instructions:
54+
# CHECK-NEXT: - - - - 1.00 - - v_pk_mov_b32 v[0:1], v[2:3], v[4:5]
55+
# CHECK-NEXT: - - - - 1.00 - - v_pk_add_f32 v[0:1], v[0:1], v[0:1]
56+
# CHECK-NEXT: - - - - 1.00 - - v_pk_mul_f32 v[0:1], v[0:1], v[0:1]
57+
# CHECK-NEXT: - - - 1.00 1.00 - - v_add_co_u32_e64 v5, s[0:1], v1, v2
58+
# CHECK-NEXT: - - - 1.00 1.00 - - v_sub_co_u32_e64 v5, s[0:1], v1, v2
59+
# CHECK-NEXT: - - - - 1.00 - - v_add_u32_e32 v5, v1, v2
60+
# CHECK-NEXT: - - - - 1.00 - - v_sub_u32_e32 v5, v1, v2
61+
62+
# CHECK: Timeline view:
63+
# CHECK-NEXT: Index 0123456789
64+
65+
# CHECK: [0,0] DE . . v_pk_mov_b32 v[0:1], v[2:3], v[4:5]
66+
# CHECK-NEXT: [0,1] .DE . . v_pk_add_f32 v[0:1], v[0:1], v[0:1]
67+
# CHECK-NEXT: [0,2] . DE . . v_pk_mul_f32 v[0:1], v[0:1], v[0:1]
68+
# CHECK-NEXT: [0,3] . DE. . v_add_co_u32_e64 v5, s[0:1], v1, v2
69+
# CHECK-NEXT: [0,4] . DeE . v_sub_co_u32_e64 v5, s[0:1], v1, v2
70+
# CHECK-NEXT: [0,5] . . DE. v_add_u32_e32 v5, v1, v2
71+
# CHECK-NEXT: [0,6] . . DE v_sub_u32_e32 v5, v1, v2
72+
73+
# CHECK: Average Wait times (based on the timeline view):
74+
# CHECK-NEXT: [0]: Executions
75+
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
76+
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
77+
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
78+
79+
# CHECK: [0] [1] [2] [3]
80+
# CHECK-NEXT: 0. 1 0.0 0.0 0.0 v_pk_mov_b32 v[0:1], v[2:3], v[4:5]
81+
# CHECK-NEXT: 1. 1 0.0 0.0 0.0 v_pk_add_f32 v[0:1], v[0:1], v[0:1]
82+
# CHECK-NEXT: 2. 1 0.0 0.0 0.0 v_pk_mul_f32 v[0:1], v[0:1], v[0:1]
83+
# CHECK-NEXT: 3. 1 0.0 0.0 0.0 v_add_co_u32_e64 v5, s[0:1], v1, v2
84+
# CHECK-NEXT: 4. 1 0.0 0.0 0.0 v_sub_co_u32_e64 v5, s[0:1], v1, v2
85+
# CHECK-NEXT: 5. 1 0.0 0.0 0.0 v_add_u32_e32 v5, v1, v2
86+
# CHECK-NEXT: 6. 1 0.0 0.0 0.0 v_sub_u32_e32 v5, v1, v2
87+
# CHECK-NEXT: 1 0.0 0.0 0.0 <total>

0 commit comments

Comments
 (0)