Skip to content

Commit b4b05e5

Browse files
committed
drm/xe/guc_pc: Retry and wait longer for GuC PC start
In the rare case of hitting a thermal limit during resume, GuC can be slow and run into delays like this: xe 0000:00:02.0: [drm] GT1: excessive init time: 667ms! \ [status = 0x8002F034, timeouts = 0] xe 0000:00:02.0: [drm] GT1: excessive init time: \ [freq = 100MHz (req = 800MHz), before = 100MHz, \ perf_limit_reasons = 0x1C001000] xe 0000:00:02.0: [drm] *ERROR* GT1: GuC PC Start failed ------------[ cut here ]------------ xe 0000:00:02.0: [drm] GT1: Failed to start GuC PC: -EIO When this happens, it completely blocks the GPU from being used. So, let's retry with a much longer timeout in the hope that it comes back. Also, let's collect some information on how long it usually takes in situations like this, so that perhaps the timeout can be tuned later. Cc: Vinay Belgaumkar <[email protected]> Cc: Jonathan Cavitt <[email protected]> Cc: John Harrison <[email protected]> Reviewed-by: Jonathan Cavitt <[email protected]> Link: https://patchwork.freedesktop.org/patch/msgid/[email protected] Signed-off-by: Rodrigo Vivi <[email protected]>
1 parent c36e344 commit b4b05e5

File tree

1 file changed

+40
-13
lines changed

1 file changed

+40
-13
lines changed

drivers/gpu/drm/xe/xe_guc_pc.c

Lines changed: 40 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#include "xe_guc_pc.h"
77

88
#include <linux/delay.h>
9+
#include <linux/ktime.h>
910

1011
#include <drm/drm_managed.h>
1112
#include <drm/drm_print.h>
@@ -20,6 +21,7 @@
2021
#include "xe_gt.h"
2122
#include "xe_gt_idle.h"
2223
#include "xe_gt_printk.h"
24+
#include "xe_gt_throttle.h"
2325
#include "xe_gt_types.h"
2426
#include "xe_guc.h"
2527
#include "xe_guc_ct.h"
@@ -50,6 +52,9 @@
5052
#define LNL_MERT_FREQ_CAP 800
5153
#define BMG_MERT_FREQ_CAP 2133
5254

55+
#define SLPC_RESET_TIMEOUT_MS 5 /* roughly 5ms, but no need for precision */
56+
#define SLPC_RESET_EXTENDED_TIMEOUT_MS 1000 /* To be used only at pc_start */
57+
5358
/**
5459
* DOC: GuC Power Conservation (PC)
5560
*
@@ -114,9 +119,10 @@ static struct iosys_map *pc_to_maps(struct xe_guc_pc *pc)
114119
FIELD_PREP(HOST2GUC_PC_SLPC_REQUEST_MSG_1_EVENT_ARGC, count))
115120

116121
static int wait_for_pc_state(struct xe_guc_pc *pc,
117-
enum slpc_global_state state)
122+
enum slpc_global_state state,
123+
int timeout_ms)
118124
{
119-
int timeout_us = 5000; /* rought 5ms, but no need for precision */
125+
int timeout_us = 1000 * timeout_ms;
120126
int slept, wait = 10;
121127

122128
xe_device_assert_mem_access(pc_to_xe(pc));
@@ -165,7 +171,8 @@ static int pc_action_query_task_state(struct xe_guc_pc *pc)
165171
};
166172
int ret;
167173

168-
if (wait_for_pc_state(pc, SLPC_GLOBAL_STATE_RUNNING))
174+
if (wait_for_pc_state(pc, SLPC_GLOBAL_STATE_RUNNING,
175+
SLPC_RESET_TIMEOUT_MS))
169176
return -EAGAIN;
170177

171178
/* Blocking here to ensure the results are ready before reading them */
@@ -188,7 +195,8 @@ static int pc_action_set_param(struct xe_guc_pc *pc, u8 id, u32 value)
188195
};
189196
int ret;
190197

191-
if (wait_for_pc_state(pc, SLPC_GLOBAL_STATE_RUNNING))
198+
if (wait_for_pc_state(pc, SLPC_GLOBAL_STATE_RUNNING,
199+
SLPC_RESET_TIMEOUT_MS))
192200
return -EAGAIN;
193201

194202
ret = xe_guc_ct_send(ct, action, ARRAY_SIZE(action), 0, 0);
@@ -209,7 +217,8 @@ static int pc_action_unset_param(struct xe_guc_pc *pc, u8 id)
209217
struct xe_guc_ct *ct = &pc_to_guc(pc)->ct;
210218
int ret;
211219

212-
if (wait_for_pc_state(pc, SLPC_GLOBAL_STATE_RUNNING))
220+
if (wait_for_pc_state(pc, SLPC_GLOBAL_STATE_RUNNING,
221+
SLPC_RESET_TIMEOUT_MS))
213222
return -EAGAIN;
214223

215224
ret = xe_guc_ct_send(ct, action, ARRAY_SIZE(action), 0, 0);
@@ -443,6 +452,15 @@ u32 xe_guc_pc_get_act_freq(struct xe_guc_pc *pc)
443452
return freq;
444453
}
445454

455+
static u32 get_cur_freq(struct xe_gt *gt)
456+
{
457+
u32 freq;
458+
459+
freq = xe_mmio_read32(&gt->mmio, RPNSWREQ);
460+
freq = REG_FIELD_GET(REQ_RATIO_MASK, freq);
461+
return decode_freq(freq);
462+
}
463+
446464
/**
447465
* xe_guc_pc_get_cur_freq - Get Current requested frequency
448466
* @pc: The GuC PC
@@ -466,10 +484,7 @@ int xe_guc_pc_get_cur_freq(struct xe_guc_pc *pc, u32 *freq)
466484
return -ETIMEDOUT;
467485
}
468486

469-
*freq = xe_mmio_read32(&gt->mmio, RPNSWREQ);
470-
471-
*freq = REG_FIELD_GET(REQ_RATIO_MASK, *freq);
472-
*freq = decode_freq(*freq);
487+
*freq = get_cur_freq(gt);
473488

474489
xe_force_wake_put(gt_to_fw(gt), fw_ref);
475490
return 0;
@@ -1016,6 +1031,7 @@ int xe_guc_pc_start(struct xe_guc_pc *pc)
10161031
struct xe_gt *gt = pc_to_gt(pc);
10171032
u32 size = PAGE_ALIGN(sizeof(struct slpc_shared_data));
10181033
unsigned int fw_ref;
1034+
ktime_t earlier;
10191035
int ret;
10201036

10211037
xe_gt_assert(gt, xe_device_uc_enabled(xe));
@@ -1040,14 +1056,25 @@ int xe_guc_pc_start(struct xe_guc_pc *pc)
10401056
memset(pc->bo->vmap.vaddr, 0, size);
10411057
slpc_shared_data_write(pc, header.size, size);
10421058

1059+
earlier = ktime_get();
10431060
ret = pc_action_reset(pc);
10441061
if (ret)
10451062
goto out;
10461063

1047-
if (wait_for_pc_state(pc, SLPC_GLOBAL_STATE_RUNNING)) {
1048-
xe_gt_err(gt, "GuC PC Start failed\n");
1049-
ret = -EIO;
1050-
goto out;
1064+
if (wait_for_pc_state(pc, SLPC_GLOBAL_STATE_RUNNING,
1065+
SLPC_RESET_TIMEOUT_MS)) {
1066+
xe_gt_warn(gt, "GuC PC start taking longer than normal [freq = %dMHz (req = %dMHz), perf_limit_reasons = 0x%08X]\n",
1067+
xe_guc_pc_get_act_freq(pc), get_cur_freq(gt),
1068+
xe_gt_throttle_get_limit_reasons(gt));
1069+
1070+
if (wait_for_pc_state(pc, SLPC_GLOBAL_STATE_RUNNING,
1071+
SLPC_RESET_EXTENDED_TIMEOUT_MS)) {
1072+
xe_gt_err(gt, "GuC PC Start failed: Dynamic GT frequency control and GT sleep states are now disabled.\n");
1073+
goto out;
1074+
}
1075+
1076+
xe_gt_warn(gt, "GuC PC excessive start time: %lldms",
1077+
ktime_ms_delta(ktime_get(), earlier));
10511078
}
10521079

10531080
ret = pc_init_freqs(pc);

0 commit comments

Comments
 (0)