Skip to content

Commit 142bc7d

Browse files
mthierryickle
authored andcommitted
drm/i915: Modify error handler for per engine hang recovery
This is a preparatory patch which modifies error handler to do per engine hang recovery. The actual patch which implements this sequence follows later in the series. The aim is to prepare existing recovery function to adapt to this new function where applicable (which fails at this point because core implementation is lacking) and continue recovery using legacy full gpu reset. A helper function is also added to query the availability of engine reset. A subsequent patch will add the capability to query which type of reset is present (engine -> full -> no-reset) via the get-param ioctl. It has been decided that the error events that are used to notify user of reset will only be sent in case if full chip reset. In case of just single (or multiple) engine resets, userspace won't be notified by these events. Note that this implementation of engine reset is for i915 directly submitting to the ELSP, where the driver manages the hang detection, recovery and resubmission. With GuC submission these tasks are shared between driver and firmware; i915 will still responsible for detecting a hang, and when it does it will have to request GuC to reset that Engine and remind the firmware about the outstanding submissions. This will be added in different patch. v2: rebase, advertise engine reset availability in platform definition, add note about GuC submission. v3: s/*engine_reset*/*reset_engine*/. (Chris) Handle reset as 2 level resets, by first going to engine only and fall backing to full/chip reset as needed, i.e. reset_engine will need the struct_mutex. v4: Pass the engine mask to i915_reset. (Chris) v5: Rebase, update selftests. v6: Rebase, prepare for mutex-less reset engine. v7: Pass reset_engine mask as a function parameter, and iterate over the engine mask for reset_engine. (Chris) v8: Use i915.reset >=2 in has_reset_engine; remove redundant reset logging; add a reset-engine-in-progress flag to prevent concurrent resets, and avoid dual purposing of reset-backoff. (Chris) v9: Support reset of different engines in parallel (Chris) v10: Handle reset-engine flag locking better (Chris) v11: Squash in reporting of per-engine-reset availability. Cc: Chris Wilson <[email protected]> Cc: Mika Kuoppala <[email protected]> Signed-off-by: Ian Lister <[email protected]> Signed-off-by: Tomas Elf <[email protected]> Signed-off-by: Arun Siluvery <[email protected]> Signed-off-by: Michel Thierry <[email protected]> Link: http://patchwork.freedesktop.org/patch/msgid/[email protected] Reviewed-by: Chris Wilson <[email protected]> Signed-off-by: Chris Wilson <[email protected]> Link: http://patchwork.freedesktop.org/patch/msgid/[email protected]
1 parent ed35dd7 commit 142bc7d

File tree

5 files changed

+78
-1
lines changed

5 files changed

+78
-1
lines changed

drivers/gpu/drm/i915/i915_drv.c

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -331,6 +331,8 @@ static int i915_getparam(struct drm_device *dev, void *data,
331331
break;
332332
case I915_PARAM_HAS_GPU_RESET:
333333
value = i915.enable_hangcheck && intel_has_gpu_reset(dev_priv);
334+
if (value && intel_has_reset_engine(dev_priv))
335+
value = 2;
334336
break;
335337
case I915_PARAM_HAS_RESOURCE_STREAMER:
336338
value = HAS_RESOURCE_STREAMER(dev_priv);
@@ -1915,6 +1917,19 @@ void i915_reset(struct drm_i915_private *dev_priv)
19151917
goto finish;
19161918
}
19171919

1920+
/**
1921+
* i915_reset_engine - reset GPU engine to recover from a hang
1922+
* @engine: engine to reset
1923+
*
1924+
* Reset a specific GPU engine. Useful if a hang is detected.
1925+
* Returns zero on successful reset or otherwise an error code.
1926+
*/
1927+
int i915_reset_engine(struct intel_engine_cs *engine)
1928+
{
1929+
/* FIXME: replace me with engine reset sequence */
1930+
return -ENODEV;
1931+
}
1932+
19181933
static int i915_pm_suspend(struct device *kdev)
19191934
{
19201935
struct pci_dev *pdev = to_pci_dev(kdev);

drivers/gpu/drm/i915/i915_drv.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -752,6 +752,7 @@ struct intel_csr {
752752
func(has_csr); \
753753
func(has_ddi); \
754754
func(has_dp_mst); \
755+
func(has_reset_engine); \
755756
func(has_fbc); \
756757
func(has_fpga_dbg); \
757758
func(has_full_ppgtt); \
@@ -1549,6 +1550,12 @@ struct i915_gpu_error {
15491550
* inspect the bit and do the reset directly, otherwise the worker
15501551
* waits for the struct_mutex.
15511552
*
1553+
* #I915_RESET_ENGINE[num_engines] - Since the driver doesn't need to
1554+
* acquire the struct_mutex to reset an engine, we need an explicit
1555+
* flag to prevent two concurrent reset attempts in the same engine.
1556+
* As the number of engines continues to grow, allocate the flags from
1557+
* the most significant bits.
1558+
*
15521559
* #I915_WEDGED - If reset fails and we can no longer use the GPU,
15531560
* we set the #I915_WEDGED bit. Prior to command submission, e.g.
15541561
* i915_gem_request_alloc(), this bit is checked and the sequence
@@ -1558,6 +1565,7 @@ struct i915_gpu_error {
15581565
#define I915_RESET_BACKOFF 0
15591566
#define I915_RESET_HANDOFF 1
15601567
#define I915_WEDGED (BITS_PER_LONG - 1)
1568+
#define I915_RESET_ENGINE (I915_WEDGED - I915_NUM_ENGINES)
15611569

15621570
/**
15631571
* Waitqueue to signal when a hang is detected. Used to for waiters
@@ -3092,6 +3100,8 @@ extern void i915_driver_unload(struct drm_device *dev);
30923100
extern int intel_gpu_reset(struct drm_i915_private *dev_priv, u32 engine_mask);
30933101
extern bool intel_has_gpu_reset(struct drm_i915_private *dev_priv);
30943102
extern void i915_reset(struct drm_i915_private *dev_priv);
3103+
extern int i915_reset_engine(struct intel_engine_cs *engine);
3104+
extern bool intel_has_reset_engine(struct drm_i915_private *dev_priv);
30953105
extern int intel_guc_reset(struct drm_i915_private *dev_priv);
30963106
extern void intel_engine_init_hangcheck(struct intel_engine_cs *engine);
30973107
extern void intel_hangcheck_init(struct drm_i915_private *dev_priv);

drivers/gpu/drm/i915/i915_irq.c

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2715,6 +2715,8 @@ void i915_handle_error(struct drm_i915_private *dev_priv,
27152715
u32 engine_mask,
27162716
const char *fmt, ...)
27172717
{
2718+
struct intel_engine_cs *engine;
2719+
unsigned int tmp;
27182720
va_list args;
27192721
char error_msg[80];
27202722

@@ -2734,18 +2736,54 @@ void i915_handle_error(struct drm_i915_private *dev_priv,
27342736
i915_capture_error_state(dev_priv, engine_mask, error_msg);
27352737
i915_clear_error_registers(dev_priv);
27362738

2739+
/*
2740+
* Try engine reset when available. We fall back to full reset if
2741+
* single reset fails.
2742+
*/
2743+
if (intel_has_reset_engine(dev_priv)) {
2744+
for_each_engine_masked(engine, dev_priv, engine_mask, tmp) {
2745+
BUILD_BUG_ON(I915_RESET_HANDOFF >= I915_RESET_ENGINE);
2746+
if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
2747+
&dev_priv->gpu_error.flags))
2748+
continue;
2749+
2750+
if (i915_reset_engine(engine) == 0)
2751+
engine_mask &= ~intel_engine_flag(engine);
2752+
2753+
clear_bit(I915_RESET_ENGINE + engine->id,
2754+
&dev_priv->gpu_error.flags);
2755+
wake_up_bit(&dev_priv->gpu_error.flags,
2756+
I915_RESET_ENGINE + engine->id);
2757+
}
2758+
}
2759+
27372760
if (!engine_mask)
27382761
goto out;
27392762

2763+
/* Full reset needs the mutex, stop any other user trying to do so. */
27402764
if (test_and_set_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags)) {
27412765
wait_event(dev_priv->gpu_error.reset_queue,
27422766
!test_bit(I915_RESET_BACKOFF,
27432767
&dev_priv->gpu_error.flags));
27442768
goto out;
27452769
}
27462770

2771+
/* Prevent any other reset-engine attempt. */
2772+
for_each_engine(engine, dev_priv, tmp) {
2773+
while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
2774+
&dev_priv->gpu_error.flags))
2775+
wait_on_bit(&dev_priv->gpu_error.flags,
2776+
I915_RESET_ENGINE + engine->id,
2777+
TASK_UNINTERRUPTIBLE);
2778+
}
2779+
27472780
i915_reset_device(dev_priv);
27482781

2782+
for_each_engine(engine, dev_priv, tmp) {
2783+
clear_bit(I915_RESET_ENGINE + engine->id,
2784+
&dev_priv->gpu_error.flags);
2785+
}
2786+
27492787
clear_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags);
27502788
wake_up_all(&dev_priv->gpu_error.reset_queue);
27512789

drivers/gpu/drm/i915/i915_pci.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -310,7 +310,8 @@ static const struct intel_device_info intel_haswell_info = {
310310
BDW_COLORS, \
311311
.has_logical_ring_contexts = 1, \
312312
.has_full_48bit_ppgtt = 1, \
313-
.has_64bit_reloc = 1
313+
.has_64bit_reloc = 1, \
314+
.has_reset_engine = 1
314315

315316
#define BDW_PLATFORM \
316317
BDW_FEATURES, \
@@ -342,6 +343,7 @@ static const struct intel_device_info intel_cherryview_info = {
342343
.has_gmch_display = 1,
343344
.has_aliasing_ppgtt = 1,
344345
.has_full_ppgtt = 1,
346+
.has_reset_engine = 1,
345347
.display_mmio_offset = VLV_DISPLAY_BASE,
346348
GEN_CHV_PIPEOFFSETS,
347349
CURSOR_OFFSETS,
@@ -387,6 +389,7 @@ static const struct intel_device_info intel_skylake_gt3_info = {
387389
.has_aliasing_ppgtt = 1, \
388390
.has_full_ppgtt = 1, \
389391
.has_full_48bit_ppgtt = 1, \
392+
.has_reset_engine = 1, \
390393
GEN_DEFAULT_PIPEOFFSETS, \
391394
IVB_CURSOR_OFFSETS, \
392395
BDW_COLORS

drivers/gpu/drm/i915/intel_uncore.c

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1719,6 +1719,17 @@ bool intel_has_gpu_reset(struct drm_i915_private *dev_priv)
17191719
return intel_get_gpu_reset(dev_priv) != NULL;
17201720
}
17211721

1722+
/*
1723+
* When GuC submission is enabled, GuC manages ELSP and can initiate the
1724+
* engine reset too. For now, fall back to full GPU reset if it is enabled.
1725+
*/
1726+
bool intel_has_reset_engine(struct drm_i915_private *dev_priv)
1727+
{
1728+
return (dev_priv->info.has_reset_engine &&
1729+
!dev_priv->guc.execbuf_client &&
1730+
i915.reset >= 2);
1731+
}
1732+
17221733
int intel_guc_reset(struct drm_i915_private *dev_priv)
17231734
{
17241735
int ret;

0 commit comments

Comments
 (0)