Skip to content

Commit a47e36d

Browse files
kwachows authored and jlawryno committed
accel/ivpu: Trigger device recovery on engine reset/resume failure
Trigger full device recovery when the driver fails to restore device state via engine reset and resume operations.

This is necessary because, even if submissions from a faulty context are blocked, the NPU may still process previously submitted faulty jobs if the engine reset fails to abort them. Such jobs can continue to generate faults and occupy device resources. When engine reset is ineffective, the only way to recover is to perform a full device recovery.

Fixes: dad945c ("accel/ivpu: Add handling of VPU_JSM_STATUS_MVNCI_CONTEXT_VIOLATION_HW")
Cc: [email protected] # v6.15+
Signed-off-by: Karol Wachowski <[email protected]>
Reviewed-by: Lizhi Hou <[email protected]>
Signed-off-by: Jacek Lawrynowicz <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
1 parent 98d3f77 commit a47e36d

File tree

2 files changed

+11
-4
lines changed

2 files changed

+11
-4
lines changed

drivers/accel/ivpu/ivpu_job.c

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -986,7 +986,8 @@ void ivpu_context_abort_work_fn(struct work_struct *work)
986986
return;
987987

988988
if (vdev->fw->sched_mode == VPU_SCHEDULING_MODE_HW)
989-
ivpu_jsm_reset_engine(vdev, 0);
989+
if (ivpu_jsm_reset_engine(vdev, 0))
990+
return;
990991

991992
mutex_lock(&vdev->context_list_lock);
992993
xa_for_each(&vdev->context_xa, ctx_id, file_priv) {
@@ -1009,7 +1010,8 @@ void ivpu_context_abort_work_fn(struct work_struct *work)
10091010
if (vdev->fw->sched_mode != VPU_SCHEDULING_MODE_HW)
10101011
goto runtime_put;
10111012

1012-
ivpu_jsm_hws_resume_engine(vdev, 0);
1013+
if (ivpu_jsm_hws_resume_engine(vdev, 0))
1014+
return;
10131015
/*
10141016
* In hardware scheduling mode NPU already has stopped processing jobs
10151017
* and won't send us any further notifications, thus we have to free job related resources

drivers/accel/ivpu/ivpu_jsm_msg.c

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@
77
#include "ivpu_hw.h"
88
#include "ivpu_ipc.h"
99
#include "ivpu_jsm_msg.h"
10+
#include "ivpu_pm.h"
1011
#include "vpu_jsm_api.h"
1112

1213
const char *ivpu_jsm_msg_type_to_str(enum vpu_ipc_msg_type type)
@@ -163,8 +164,10 @@ int ivpu_jsm_reset_engine(struct ivpu_device *vdev, u32 engine)
163164

164165
ret = ivpu_ipc_send_receive(vdev, &req, VPU_JSM_MSG_ENGINE_RESET_DONE, &resp,
165166
VPU_IPC_CHAN_ASYNC_CMD, vdev->timeout.jsm);
166-
if (ret)
167+
if (ret) {
167168
ivpu_err_ratelimited(vdev, "Failed to reset engine %d: %d\n", engine, ret);
169+
ivpu_pm_trigger_recovery(vdev, "Engine reset failed");
170+
}
168171

169172
return ret;
170173
}
@@ -354,8 +357,10 @@ int ivpu_jsm_hws_resume_engine(struct ivpu_device *vdev, u32 engine)
354357

355358
ret = ivpu_ipc_send_receive(vdev, &req, VPU_JSM_MSG_HWS_RESUME_ENGINE_DONE, &resp,
356359
VPU_IPC_CHAN_ASYNC_CMD, vdev->timeout.jsm);
357-
if (ret)
360+
if (ret) {
358361
ivpu_err_ratelimited(vdev, "Failed to resume engine %d: %d\n", engine, ret);
362+
ivpu_pm_trigger_recovery(vdev, "Engine resume failed");
363+
}
359364

360365
return ret;
361366
}

0 commit comments

Comments
 (0)