Skip to content

Commit a1ef70e

Browse files
mthierryickle
authored and committed
drm/i915: Add support for per engine reset recovery
This change implements support for per-engine reset as an initial, less intrusive hang recovery option to be attempted before falling back to the legacy full GPU reset recovery mode if necessary. This is only supported from Gen8 onwards. Hangchecker determines which engines are hung and invokes the error handler to recover from the hang. The error handler schedules recovery for each of the engines that are hung. The recovery procedure is as follows: - identify the request that caused the hang and drop it - force the engine to idle: this is done by issuing a reset request - reset the engine - re-init the engine to resume submissions. If the engine reset fails then we fall back to the heavyweight full GPU reset, which resets all engines and reinitializes the complete state of HW and SW. v2: Rebase. v3: s/*engine_reset*/*reset_engine*/; freeze engine and irqs before calling i915_gem_reset_engine (Chris). v4: Rebase, modify i915_gem_reset_prepare to use a ring mask and reuse the function for reset_engine. v5: intel_reset_engine_start/cancel instead of request/unrequest_reset. v6: Clean up reset_engine function to not require mutex, i.e. no need to call revoke/restore_fences and _retire_requests (Chris). v7: Remove leftovers from v5, i.e. no need to disable irq, hold forcewake or wakeup the handoff bit (Chris). v8: engine_retire_requests should be (and it was) static; explain that we have to re-init the engine after reset, which is why the init_hw call is needed; check reset-in-progress flag (Chris). v9: Rebase, include code to pass the active request to gem_reset_engine (as it is already done in full reset). Remove unnecessary intel_reset_engine_start/cancel, these are executed as part of the reset. v10: Rebase, use the right I915_RESET_ENGINE flag. v11: Fixup to call reset_finish_engine even on error.
Cc: Chris Wilson <[email protected]> Cc: Mika Kuoppala <[email protected]> Signed-off-by: Tomas Elf <[email protected]> Signed-off-by: Arun Siluvery <[email protected]> Signed-off-by: Michel Thierry <[email protected]> Link: http://patchwork.freedesktop.org/patch/msgid/[email protected] Reviewed-by: Chris Wilson <[email protected]> Signed-off-by: Chris Wilson <[email protected]> Link: http://patchwork.freedesktop.org/patch/msgid/[email protected]
1 parent 142bc7d commit a1ef70e

File tree

3 files changed

+110
-38
lines changed

3 files changed

+110
-38
lines changed

drivers/gpu/drm/i915/i915_drv.c

Lines changed: 48 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1923,11 +1923,57 @@ void i915_reset(struct drm_i915_private *dev_priv)
19231923
*
19241924
* Reset a specific GPU engine. Useful if a hang is detected.
19251925
* Returns zero on successful reset or otherwise an error code.
1926+
*
1927+
* Procedure is:
1928+
* - identifies the request that caused the hang and it is dropped
1929+
* - reset engine (which will force the engine to idle)
1930+
* - re-init/configure engine
19261931
*/
19271932
int i915_reset_engine(struct intel_engine_cs *engine)
19281933
{
1929-
/* FIXME: replace me with engine reset sequence */
1930-
return -ENODEV;
1934+
struct i915_gpu_error *error = &engine->i915->gpu_error;
1935+
struct drm_i915_gem_request *active_request;
1936+
int ret;
1937+
1938+
GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &error->flags));
1939+
1940+
DRM_DEBUG_DRIVER("resetting %s\n", engine->name);
1941+
1942+
active_request = i915_gem_reset_prepare_engine(engine);
1943+
if (IS_ERR(active_request)) {
1944+
DRM_DEBUG_DRIVER("Previous reset failed, promote to full reset\n");
1945+
ret = PTR_ERR(active_request);
1946+
goto out;
1947+
}
1948+
1949+
/*
1950+
* The request that caused the hang is stuck on elsp, we know the
1951+
* active request and can drop it, adjust head to skip the offending
1952+
* request to resume executing remaining requests in the queue.
1953+
*/
1954+
i915_gem_reset_engine(engine, active_request);
1955+
1956+
/* Finally, reset just this engine. */
1957+
ret = intel_gpu_reset(engine->i915, intel_engine_flag(engine));
1958+
1959+
i915_gem_reset_finish_engine(engine);
1960+
1961+
if (ret) {
1962+
/* If we fail here, we expect to fallback to a global reset */
1963+
DRM_DEBUG_DRIVER("Failed to reset %s, ret=%d\n",
1964+
engine->name, ret);
1965+
goto out;
1966+
}
1967+
1968+
/*
1969+
* The engine and its registers (and workarounds in case of render)
1970+
* have been reset to their default values. Follow the init_ring
1971+
* process to program RING_MODE, HWSP and re-enable submission.
1972+
*/
1973+
ret = engine->init_hw(engine);
1974+
1975+
out:
1976+
return ret;
19311977
}
19321978

19331979
static int i915_pm_suspend(struct device *kdev)

drivers/gpu/drm/i915/i915_drv.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3474,11 +3474,16 @@ static inline u32 i915_reset_count(struct i915_gpu_error *error)
34743474
return READ_ONCE(error->reset_count);
34753475
}
34763476

3477+
struct drm_i915_gem_request *
3478+
i915_gem_reset_prepare_engine(struct intel_engine_cs *engine);
34773479
int i915_gem_reset_prepare(struct drm_i915_private *dev_priv);
34783480
void i915_gem_reset(struct drm_i915_private *dev_priv);
3481+
void i915_gem_reset_finish_engine(struct intel_engine_cs *engine);
34793482
void i915_gem_reset_finish(struct drm_i915_private *dev_priv);
34803483
void i915_gem_set_wedged(struct drm_i915_private *dev_priv);
34813484
bool i915_gem_unset_wedged(struct drm_i915_private *dev_priv);
3485+
void i915_gem_reset_engine(struct intel_engine_cs *engine,
3486+
struct drm_i915_gem_request *request);
34823487

34833488
void i915_gem_init_mmio(struct drm_i915_private *i915);
34843489
int __must_check i915_gem_init(struct drm_i915_private *dev_priv);

drivers/gpu/drm/i915/i915_gem.c

Lines changed: 57 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -2831,45 +2831,61 @@ static bool engine_stalled(struct intel_engine_cs *engine)
28312831
return true;
28322832
}
28332833

2834+
/*
2835+
* Ensure irq handler finishes, and not run again.
2836+
* Also return the active request so that we only search for it once.
2837+
*/
2838+
struct drm_i915_gem_request *
2839+
i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
2840+
{
2841+
struct drm_i915_gem_request *request = NULL;
2842+
2843+
/* Prevent the signaler thread from updating the request
2844+
* state (by calling dma_fence_signal) as we are processing
2845+
* the reset. The write from the GPU of the seqno is
2846+
* asynchronous and the signaler thread may see a different
2847+
* value to us and declare the request complete, even though
2848+
* the reset routine have picked that request as the active
2849+
* (incomplete) request. This conflict is not handled
2850+
* gracefully!
2851+
*/
2852+
kthread_park(engine->breadcrumbs.signaler);
2853+
2854+
/* Prevent request submission to the hardware until we have
2855+
* completed the reset in i915_gem_reset_finish(). If a request
2856+
* is completed by one engine, it may then queue a request
2857+
* to a second via its engine->irq_tasklet *just* as we are
2858+
* calling engine->init_hw() and also writing the ELSP.
2859+
* Turning off the engine->irq_tasklet until the reset is over
2860+
* prevents the race.
2861+
*/
2862+
tasklet_kill(&engine->irq_tasklet);
2863+
tasklet_disable(&engine->irq_tasklet);
2864+
2865+
if (engine->irq_seqno_barrier)
2866+
engine->irq_seqno_barrier(engine);
2867+
2868+
if (engine_stalled(engine)) {
2869+
request = i915_gem_find_active_request(engine);
2870+
if (request && request->fence.error == -EIO)
2871+
request = ERR_PTR(-EIO); /* Previous reset failed! */
2872+
}
2873+
2874+
return request;
2875+
}
2876+
28342877
int i915_gem_reset_prepare(struct drm_i915_private *dev_priv)
28352878
{
28362879
struct intel_engine_cs *engine;
2880+
struct drm_i915_gem_request *request;
28372881
enum intel_engine_id id;
28382882
int err = 0;
28392883

2840-
/* Ensure irq handler finishes, and not run again. */
28412884
for_each_engine(engine, dev_priv, id) {
2842-
struct drm_i915_gem_request *request = NULL;
2843-
2844-
/* Prevent the signaler thread from updating the request
2845-
* state (by calling dma_fence_signal) as we are processing
2846-
* the reset. The write from the GPU of the seqno is
2847-
* asynchronous and the signaler thread may see a different
2848-
* value to us and declare the request complete, even though
2849-
* the reset routine have picked that request as the active
2850-
* (incomplete) request. This conflict is not handled
2851-
* gracefully!
2852-
*/
2853-
kthread_park(engine->breadcrumbs.signaler);
2854-
2855-
/* Prevent request submission to the hardware until we have
2856-
* completed the reset in i915_gem_reset_finish(). If a request
2857-
* is completed by one engine, it may then queue a request
2858-
* to a second via its engine->irq_tasklet *just* as we are
2859-
* calling engine->init_hw() and also writing the ELSP.
2860-
* Turning off the engine->irq_tasklet until the reset is over
2861-
* prevents the race.
2862-
*/
2863-
tasklet_kill(&engine->irq_tasklet);
2864-
tasklet_disable(&engine->irq_tasklet);
2865-
2866-
if (engine->irq_seqno_barrier)
2867-
engine->irq_seqno_barrier(engine);
2868-
2869-
if (engine_stalled(engine)) {
2870-
request = i915_gem_find_active_request(engine);
2871-
if (request && request->fence.error == -EIO)
2872-
err = -EIO; /* Previous reset failed! */
2885+
request = i915_gem_reset_prepare_engine(engine);
2886+
if (IS_ERR(request)) {
2887+
err = PTR_ERR(request);
2888+
continue;
28732889
}
28742890

28752891
engine->hangcheck.active_request = request;
@@ -2960,8 +2976,8 @@ static bool i915_gem_reset_request(struct drm_i915_gem_request *request)
29602976
return guilty;
29612977
}
29622978

2963-
static void i915_gem_reset_engine(struct intel_engine_cs *engine,
2964-
struct drm_i915_gem_request *request)
2979+
void i915_gem_reset_engine(struct intel_engine_cs *engine,
2980+
struct drm_i915_gem_request *request)
29652981
{
29662982
if (request && i915_gem_reset_request(request)) {
29672983
DRM_DEBUG_DRIVER("resetting %s to restart from tail of request 0x%x\n",
@@ -3004,6 +3020,12 @@ void i915_gem_reset(struct drm_i915_private *dev_priv)
30043020
}
30053021
}
30063022

3023+
void i915_gem_reset_finish_engine(struct intel_engine_cs *engine)
3024+
{
3025+
tasklet_enable(&engine->irq_tasklet);
3026+
kthread_unpark(engine->breadcrumbs.signaler);
3027+
}
3028+
30073029
void i915_gem_reset_finish(struct drm_i915_private *dev_priv)
30083030
{
30093031
struct intel_engine_cs *engine;
@@ -3013,8 +3035,7 @@ void i915_gem_reset_finish(struct drm_i915_private *dev_priv)
30133035

30143036
for_each_engine(engine, dev_priv, id) {
30153037
engine->hangcheck.active_request = NULL;
3016-
tasklet_enable(&engine->irq_tasklet);
3017-
kthread_unpark(engine->breadcrumbs.signaler);
3038+
i915_gem_reset_finish_engine(engine);
30183039
}
30193040
}
30203041

0 commit comments

Comments
 (0)