Skip to content

Commit 9a3f210

Browse files
committed
Merge tag 'drm-xe-fixes-2025-09-11' of https://gitlab.freedesktop.org/drm/xe/kernel into drm-fixes
- Don't touch survivability_mode on fini (Michal)
- Fixes around eviction and suspend (Thomas)
- Extend Wa_13011645652 to PTL-H, WCL (Julia)

Signed-off-by: Dave Airlie <[email protected]>
From: Rodrigo Vivi <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
2 parents dab1f85 + fd99415 commit 9a3f210

File tree

13 files changed: +115 / -29 lines changed

drivers/gpu/drm/xe/tests/xe_bo.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,7 @@ static int evict_test_run_tile(struct xe_device *xe, struct xe_tile *tile, struc
236236
}
237237

238238
xe_bo_lock(external, false);
239-
err = xe_bo_pin_external(external);
239+
err = xe_bo_pin_external(external, false);
240240
xe_bo_unlock(external);
241241
if (err) {
242242
KUNIT_FAIL(test, "external bo pin err=%pe\n",

drivers/gpu/drm/xe/tests/xe_dma_buf.c

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -89,15 +89,7 @@ static void check_residency(struct kunit *test, struct xe_bo *exported,
8989
return;
9090
}
9191

92-
/*
93-
* If on different devices, the exporter is kept in system if
94-
* possible, saving a migration step as the transfer is just
95-
* likely as fast from system memory.
96-
*/
97-
if (params->mem_mask & XE_BO_FLAG_SYSTEM)
98-
KUNIT_EXPECT_TRUE(test, xe_bo_is_mem_type(exported, XE_PL_TT));
99-
else
100-
KUNIT_EXPECT_TRUE(test, xe_bo_is_mem_type(exported, mem_type));
92+
KUNIT_EXPECT_TRUE(test, xe_bo_is_mem_type(exported, mem_type));
10193

10294
if (params->force_different_devices)
10395
KUNIT_EXPECT_TRUE(test, xe_bo_is_mem_type(imported, XE_PL_TT));

drivers/gpu/drm/xe/xe_bo.c

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,8 @@ static void try_add_system(struct xe_device *xe, struct xe_bo *bo,
186186

187187
bo->placements[*c] = (struct ttm_place) {
188188
.mem_type = XE_PL_TT,
189+
.flags = (bo_flags & XE_BO_FLAG_VRAM_MASK) ?
190+
TTM_PL_FLAG_FALLBACK : 0,
189191
};
190192
*c += 1;
191193
}
@@ -2269,14 +2271,15 @@ uint64_t vram_region_gpu_offset(struct ttm_resource *res)
22692271
/**
22702272
* xe_bo_pin_external - pin an external BO
22712273
* @bo: buffer object to be pinned
2274+
* @in_place: Pin in current placement, don't attempt to migrate.
22722275
*
22732276
* Pin an external (not tied to a VM, can be exported via dma-buf / prime FD)
22742277
* BO. Unique call compared to xe_bo_pin as this function has its own set of
22752278
* asserts and code to ensure evict / restore on suspend / resume.
22762279
*
22772280
* Returns 0 for success, negative error code otherwise.
22782281
*/
2279-
int xe_bo_pin_external(struct xe_bo *bo)
2282+
int xe_bo_pin_external(struct xe_bo *bo, bool in_place)
22802283
{
22812284
struct xe_device *xe = xe_bo_device(bo);
22822285
int err;
@@ -2285,9 +2288,11 @@ int xe_bo_pin_external(struct xe_bo *bo)
22852288
xe_assert(xe, xe_bo_is_user(bo));
22862289

22872290
if (!xe_bo_is_pinned(bo)) {
2288-
err = xe_bo_validate(bo, NULL, false);
2289-
if (err)
2290-
return err;
2291+
if (!in_place) {
2292+
err = xe_bo_validate(bo, NULL, false);
2293+
if (err)
2294+
return err;
2295+
}
22912296

22922297
spin_lock(&xe->pinned.lock);
22932298
list_add_tail(&bo->pinned_link, &xe->pinned.late.external);
@@ -2440,6 +2445,9 @@ int xe_bo_validate(struct xe_bo *bo, struct xe_vm *vm, bool allow_res_evict)
24402445
};
24412446
int ret;
24422447

2448+
if (xe_bo_is_pinned(bo))
2449+
return 0;
2450+
24432451
if (vm) {
24442452
lockdep_assert_held(&vm->lock);
24452453
xe_vm_assert_held(vm);

drivers/gpu/drm/xe/xe_bo.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,7 @@ static inline void xe_bo_unlock_vm_held(struct xe_bo *bo)
198198
}
199199
}
200200

201-
int xe_bo_pin_external(struct xe_bo *bo);
201+
int xe_bo_pin_external(struct xe_bo *bo, bool in_place);
202202
int xe_bo_pin(struct xe_bo *bo);
203203
void xe_bo_unpin_external(struct xe_bo *bo);
204204
void xe_bo_unpin(struct xe_bo *bo);

drivers/gpu/drm/xe/xe_device_types.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -553,6 +553,12 @@ struct xe_device {
553553

554554
/** @pm_notifier: Our PM notifier to perform actions in response to various PM events. */
555555
struct notifier_block pm_notifier;
556+
/** @pm_block: Completion to block validating tasks on suspend / hibernate prepare */
557+
struct completion pm_block;
558+
/** @rebind_resume_list: List of wq items to kick on resume. */
559+
struct list_head rebind_resume_list;
560+
/** @rebind_resume_lock: Lock to protect the rebind_resume_list */
561+
struct mutex rebind_resume_lock;
556562

557563
/** @pmt: Support the PMT driver callback interface */
558564
struct {

drivers/gpu/drm/xe/xe_dma_buf.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ static int xe_dma_buf_pin(struct dma_buf_attachment *attach)
7272
return ret;
7373
}
7474

75-
ret = xe_bo_pin_external(bo);
75+
ret = xe_bo_pin_external(bo, true);
7676
xe_assert(xe, !ret);
7777

7878
return 0;

drivers/gpu/drm/xe/xe_exec.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,15 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
237237
goto err_unlock_list;
238238
}
239239

240+
/*
241+
* It's OK to block interruptible here with the vm lock held, since
242+
* on task freezing during suspend / hibernate, the call will
243+
* return -ERESTARTSYS and the IOCTL will be rerun.
244+
*/
245+
err = wait_for_completion_interruptible(&xe->pm_block);
246+
if (err)
247+
goto err_unlock_list;
248+
240249
vm_exec.vm = &vm->gpuvm;
241250
vm_exec.flags = DRM_EXEC_INTERRUPTIBLE_WAIT;
242251
if (xe_vm_in_lr_mode(vm)) {

drivers/gpu/drm/xe/xe_pm.c

Lines changed: 32 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include "xe_pcode.h"
2525
#include "xe_pxp.h"
2626
#include "xe_trace.h"
27+
#include "xe_vm.h"
2728
#include "xe_wa.h"
2829

2930
/**
@@ -290,6 +291,19 @@ static u32 vram_threshold_value(struct xe_device *xe)
290291
return DEFAULT_VRAM_THRESHOLD;
291292
}
292293

294+
static void xe_pm_wake_rebind_workers(struct xe_device *xe)
295+
{
296+
struct xe_vm *vm, *next;
297+
298+
mutex_lock(&xe->rebind_resume_lock);
299+
list_for_each_entry_safe(vm, next, &xe->rebind_resume_list,
300+
preempt.pm_activate_link) {
301+
list_del_init(&vm->preempt.pm_activate_link);
302+
xe_vm_resume_rebind_worker(vm);
303+
}
304+
mutex_unlock(&xe->rebind_resume_lock);
305+
}
306+
293307
static int xe_pm_notifier_callback(struct notifier_block *nb,
294308
unsigned long action, void *data)
295309
{
@@ -299,30 +313,30 @@ static int xe_pm_notifier_callback(struct notifier_block *nb,
299313
switch (action) {
300314
case PM_HIBERNATION_PREPARE:
301315
case PM_SUSPEND_PREPARE:
316+
reinit_completion(&xe->pm_block);
302317
xe_pm_runtime_get(xe);
303318
err = xe_bo_evict_all_user(xe);
304-
if (err) {
319+
if (err)
305320
drm_dbg(&xe->drm, "Notifier evict user failed (%d)\n", err);
306-
xe_pm_runtime_put(xe);
307-
break;
308-
}
309321

310322
err = xe_bo_notifier_prepare_all_pinned(xe);
311-
if (err) {
323+
if (err)
312324
drm_dbg(&xe->drm, "Notifier prepare pin failed (%d)\n", err);
313-
xe_pm_runtime_put(xe);
314-
}
325+
/*
326+
* Keep the runtime pm reference until post hibernation / post suspend to
327+
* avoid a runtime suspend interfering with evicted objects or backup
328+
* allocations.
329+
*/
315330
break;
316331
case PM_POST_HIBERNATION:
317332
case PM_POST_SUSPEND:
333+
complete_all(&xe->pm_block);
334+
xe_pm_wake_rebind_workers(xe);
318335
xe_bo_notifier_unprepare_all_pinned(xe);
319336
xe_pm_runtime_put(xe);
320337
break;
321338
}
322339

323-
if (err)
324-
return NOTIFY_BAD;
325-
326340
return NOTIFY_DONE;
327341
}
328342

@@ -344,6 +358,14 @@ int xe_pm_init(struct xe_device *xe)
344358
if (err)
345359
return err;
346360

361+
err = drmm_mutex_init(&xe->drm, &xe->rebind_resume_lock);
362+
if (err)
363+
goto err_unregister;
364+
365+
init_completion(&xe->pm_block);
366+
complete_all(&xe->pm_block);
367+
INIT_LIST_HEAD(&xe->rebind_resume_list);
368+
347369
/* For now suspend/resume is only allowed with GuC */
348370
if (!xe_device_uc_enabled(xe))
349371
return 0;

drivers/gpu/drm/xe/xe_survivability_mode.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@
4141
*
4242
* # echo 1 > /sys/kernel/config/xe/0000:03:00.0/survivability_mode
4343
*
44+
* It is the responsibility of the user to clear the mode once firmware flash is complete.
45+
*
4446
* Refer :ref:`xe_configfs` for more details on how to use configfs
4547
*
4648
* Survivability mode is indicated by the below admin-only readable sysfs which provides additional
@@ -147,7 +149,6 @@ static void xe_survivability_mode_fini(void *arg)
147149
struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
148150
struct device *dev = &pdev->dev;
149151

150-
xe_configfs_clear_survivability_mode(pdev);
151152
sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr);
152153
}
153154

drivers/gpu/drm/xe/xe_vm.c

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -393,6 +393,9 @@ static int xe_gpuvm_validate(struct drm_gpuvm_bo *vm_bo, struct drm_exec *exec)
393393
list_move_tail(&gpuva_to_vma(gpuva)->combined_links.rebind,
394394
&vm->rebind_list);
395395

396+
if (!try_wait_for_completion(&vm->xe->pm_block))
397+
return -EAGAIN;
398+
396399
ret = xe_bo_validate(gem_to_xe_bo(vm_bo->obj), vm, false);
397400
if (ret)
398401
return ret;
@@ -479,6 +482,33 @@ static int xe_preempt_work_begin(struct drm_exec *exec, struct xe_vm *vm,
479482
return xe_vm_validate_rebind(vm, exec, vm->preempt.num_exec_queues);
480483
}
481484

485+
static bool vm_suspend_rebind_worker(struct xe_vm *vm)
486+
{
487+
struct xe_device *xe = vm->xe;
488+
bool ret = false;
489+
490+
mutex_lock(&xe->rebind_resume_lock);
491+
if (!try_wait_for_completion(&vm->xe->pm_block)) {
492+
ret = true;
493+
list_move_tail(&vm->preempt.pm_activate_link, &xe->rebind_resume_list);
494+
}
495+
mutex_unlock(&xe->rebind_resume_lock);
496+
497+
return ret;
498+
}
499+
500+
/**
501+
* xe_vm_resume_rebind_worker() - Resume the rebind worker.
502+
* @vm: The vm whose preempt worker to resume.
503+
*
504+
* Resume a preempt worker that was previously suspended by
505+
* vm_suspend_rebind_worker().
506+
*/
507+
void xe_vm_resume_rebind_worker(struct xe_vm *vm)
508+
{
509+
queue_work(vm->xe->ordered_wq, &vm->preempt.rebind_work);
510+
}
511+
482512
static void preempt_rebind_work_func(struct work_struct *w)
483513
{
484514
struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work);
@@ -502,6 +532,11 @@ static void preempt_rebind_work_func(struct work_struct *w)
502532
}
503533

504534
retry:
535+
if (!try_wait_for_completion(&vm->xe->pm_block) && vm_suspend_rebind_worker(vm)) {
536+
up_write(&vm->lock);
537+
return;
538+
}
539+
505540
if (xe_vm_userptr_check_repin(vm)) {
506541
err = xe_vm_userptr_pin(vm);
507542
if (err)
@@ -1714,6 +1749,7 @@ struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags, struct xe_file *xef)
17141749
if (flags & XE_VM_FLAG_LR_MODE) {
17151750
INIT_WORK(&vm->preempt.rebind_work, preempt_rebind_work_func);
17161751
xe_pm_runtime_get_noresume(xe);
1752+
INIT_LIST_HEAD(&vm->preempt.pm_activate_link);
17171753
}
17181754

17191755
if (flags & XE_VM_FLAG_FAULT_MODE) {
@@ -1895,8 +1931,12 @@ void xe_vm_close_and_put(struct xe_vm *vm)
18951931
xe_assert(xe, !vm->preempt.num_exec_queues);
18961932

18971933
xe_vm_close(vm);
1898-
if (xe_vm_in_preempt_fence_mode(vm))
1934+
if (xe_vm_in_preempt_fence_mode(vm)) {
1935+
mutex_lock(&xe->rebind_resume_lock);
1936+
list_del_init(&vm->preempt.pm_activate_link);
1937+
mutex_unlock(&xe->rebind_resume_lock);
18991938
flush_work(&vm->preempt.rebind_work);
1939+
}
19001940
if (xe_vm_in_fault_mode(vm))
19011941
xe_svm_close(vm);
19021942

0 commit comments

Comments
 (0)