
Commit 4ff4b44

drm/i915: Store a direct lookup from object handle to vma
The advent of full-ppgtt led to an extra indirection between the object and its binding. That extra indirection has a noticeable impact on how fast we can convert from the user handles to our internal vma for execbuffer. In order to bypass the extra indirection, we use a resizable hashtable to jump from the object to the per-ctx vma. rhashtable was considered, but we don't need the online resizing feature and the extra complexity proved to undermine its usefulness. Instead, we simply reallocate the hashtable on demand in a background task and serialize it before iterating.

In non-full-ppgtt modes, multiple files and multiple contexts can share the same vma. This leads to having multiple possible handle->vma links, so we only use the first to establish the fast path. The majority of buffers are not shared and so we should still be able to realise speedups with multiple clients.

v2: Prettier names, more magic.
v3: Many style tweaks, most notably hiding the misuse of execobj[].rsvd2

Signed-off-by: Chris Wilson <[email protected]>
Reviewed-by: Joonas Lahtinen <[email protected]>
1 parent 4c9c0d0 commit 4ff4b44
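
To make the fast path concrete, here is a minimal sketch (editorial commentary, not part of the patch) of how an execbuffer-style lookup can consult the new per-context table. The helper name lookup_vma_fast and its locking are assumptions; the real lookup lives in i915_gem_execbuffer.c, which is not part of this excerpt. The fields it touches (vma_lut.ht, ht_bits, ctx_node, ctx_handle) are the ones introduced by this commit.

/*
 * Illustrative sketch only; assumes the i915 driver headers.
 */
#include <linux/hash.h>         /* hash_32() */
#include <linux/list.h>         /* hlist_for_each_entry() */
#include "i915_gem_context.h"   /* struct i915_gem_context, vma_lut */

static struct i915_vma *
lookup_vma_fast(struct i915_gem_context *ctx, u32 handle)
{
        struct i915_gem_context_vma_lut *lut = &ctx->vma_lut;
        struct i915_vma *vma;

        /*
         * One hash plus a short bucket walk replaces chasing
         * handle -> object -> per-ctx vma through the idr and vma_list.
         */
        hlist_for_each_entry(vma,
                             &lut->ht[hash_32(handle, lut->ht_bits)],
                             ctx_node) {
                if (vma->ctx_handle == handle)
                        return vma;
        }

        return NULL; /* slow path: resolve via the object and link it in */
}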

File tree

11 files changed (+322, -112 lines)

drivers/gpu/drm/i915/i915_debugfs.c

Lines changed: 6 additions & 0 deletions
@@ -1998,6 +1998,12 @@ static int i915_context_status(struct seq_file *m, void *unused)
                         seq_putc(m, '\n');
                 }
 
+                seq_printf(m,
+                           "\tvma hashtable size=%u (actual %lu), count=%u\n",
+                           ctx->vma_lut.ht_size,
+                           BIT(ctx->vma_lut.ht_bits),
+                           ctx->vma_lut.ht_count);
+
                 seq_putc(m, '\n');
         }

drivers/gpu/drm/i915/i915_drv.h

Lines changed: 1 addition & 1 deletion
@@ -37,7 +37,7 @@
 #include <linux/i2c.h>
 #include <linux/i2c-algo-bit.h>
 #include <linux/backlight.h>
-#include <linux/hashtable.h>
+#include <linux/hash.h>
 #include <linux/intel-iommu.h>
 #include <linux/kref.h>
 #include <linux/pm_qos.h>

drivers/gpu/drm/i915/i915_gem.c

Lines changed: 4 additions & 1 deletion
@@ -3261,6 +3261,10 @@ void i915_gem_close_object(struct drm_gem_object *gem, struct drm_file *file)
                 if (vma->vm->file == fpriv)
                         i915_vma_close(vma);
 
+        vma = obj->vma_hashed;
+        if (vma && vma->ctx->file_priv == fpriv)
+                i915_vma_unlink_ctx(vma);
+
         if (i915_gem_object_is_active(obj) &&
             !i915_gem_object_has_active_reference(obj)) {
                 i915_gem_object_set_active_reference(obj);
@@ -4254,7 +4258,6 @@ void i915_gem_object_init(struct drm_i915_gem_object *obj,
 
         INIT_LIST_HEAD(&obj->global_link);
         INIT_LIST_HEAD(&obj->userfault_link);
-        INIT_LIST_HEAD(&obj->obj_exec_link);
         INIT_LIST_HEAD(&obj->vma_list);
         INIT_LIST_HEAD(&obj->batch_pool_link);

drivers/gpu/drm/i915/i915_gem_context.c

Lines changed: 81 additions & 1 deletion
@@ -85,13 +85,78 @@
  *
  */
 
+#include <linux/log2.h>
 #include <drm/drmP.h>
 #include <drm/i915_drm.h>
 #include "i915_drv.h"
 #include "i915_trace.h"
 
 #define ALL_L3_SLICES(dev) (1 << NUM_L3_SLICES(dev)) - 1
 
+/* Initial size (as log2) to preallocate the handle->object hashtable */
+#define VMA_HT_BITS 2u /* 4 x 2 pointers, 64 bytes minimum */
+
+static void resize_vma_ht(struct work_struct *work)
+{
+        struct i915_gem_context_vma_lut *lut =
+                container_of(work, typeof(*lut), resize);
+        unsigned int bits, new_bits, size, i;
+        struct hlist_head *new_ht;
+
+        GEM_BUG_ON(!(lut->ht_size & I915_CTX_RESIZE_IN_PROGRESS));
+
+        bits = 1 + ilog2(4*lut->ht_count/3 + 1);
+        new_bits = min_t(unsigned int,
+                         max(bits, VMA_HT_BITS),
+                         sizeof(unsigned int) * BITS_PER_BYTE - 1);
+        if (new_bits == lut->ht_bits)
+                goto out;
+
+        new_ht = kzalloc(sizeof(*new_ht)<<new_bits, GFP_KERNEL | __GFP_NOWARN);
+        if (!new_ht)
+                new_ht = vzalloc(sizeof(*new_ht)<<new_bits);
+        if (!new_ht)
+                /* Pretend resize succeeded and stop calling us for a bit! */
+                goto out;
+
+        size = BIT(lut->ht_bits);
+        for (i = 0; i < size; i++) {
+                struct i915_vma *vma;
+                struct hlist_node *tmp;
+
+                hlist_for_each_entry_safe(vma, tmp, &lut->ht[i], ctx_node)
+                        hlist_add_head(&vma->ctx_node,
+                                       &new_ht[hash_32(vma->ctx_handle,
+                                                       new_bits)]);
+        }
+        kvfree(lut->ht);
+        lut->ht = new_ht;
+        lut->ht_bits = new_bits;
+out:
+        smp_store_release(&lut->ht_size, BIT(bits));
+        GEM_BUG_ON(lut->ht_size & I915_CTX_RESIZE_IN_PROGRESS);
+}
+
+static void vma_lut_free(struct i915_gem_context *ctx)
+{
+        struct i915_gem_context_vma_lut *lut = &ctx->vma_lut;
+        unsigned int i, size;
+
+        if (lut->ht_size & I915_CTX_RESIZE_IN_PROGRESS)
+                cancel_work_sync(&lut->resize);
+
+        size = BIT(lut->ht_bits);
+        for (i = 0; i < size; i++) {
+                struct i915_vma *vma;
+
+                hlist_for_each_entry(vma, &lut->ht[i], ctx_node) {
+                        vma->obj->vma_hashed = NULL;
+                        vma->ctx = NULL;
+                }
+        }
+        kvfree(lut->ht);
+}
+
 void i915_gem_context_free(struct kref *ctx_ref)
 {
         struct i915_gem_context *ctx = container_of(ctx_ref, typeof(*ctx), ref);
@@ -101,6 +166,7 @@ void i915_gem_context_free(struct kref *ctx_ref)
         trace_i915_context_free(ctx);
         GEM_BUG_ON(!i915_gem_context_is_closed(ctx));
 
+        vma_lut_free(ctx);
         i915_ppgtt_put(ctx->ppgtt);
 
         for (i = 0; i < I915_NUM_ENGINES; i++) {
@@ -118,6 +184,7 @@ void i915_gem_context_free(struct kref *ctx_ref)
 
         kfree(ctx->name);
         put_pid(ctx->pid);
+
         list_del(&ctx->link);
 
         ida_simple_remove(&ctx->i915->context_hw_ida, ctx->hw_id);
@@ -201,13 +268,24 @@ __create_hw_context(struct drm_i915_private *dev_priv,
         ctx->i915 = dev_priv;
         ctx->priority = I915_PRIORITY_NORMAL;
 
+        ctx->vma_lut.ht_bits = VMA_HT_BITS;
+        ctx->vma_lut.ht_size = BIT(VMA_HT_BITS);
+        BUILD_BUG_ON(BIT(VMA_HT_BITS) == I915_CTX_RESIZE_IN_PROGRESS);
+        ctx->vma_lut.ht = kcalloc(ctx->vma_lut.ht_size,
+                                  sizeof(*ctx->vma_lut.ht),
+                                  GFP_KERNEL);
+        if (!ctx->vma_lut.ht)
+                goto err_out;
+
+        INIT_WORK(&ctx->vma_lut.resize, resize_vma_ht);
+
         /* Default context will never have a file_priv */
         ret = DEFAULT_CONTEXT_HANDLE;
         if (file_priv) {
                 ret = idr_alloc(&file_priv->context_idr, ctx,
                                 DEFAULT_CONTEXT_HANDLE, 0, GFP_KERNEL);
                 if (ret < 0)
-                        goto err_out;
+                        goto err_lut;
         }
         ctx->user_handle = ret;
 
@@ -248,6 +326,8 @@ __create_hw_context(struct drm_i915_private *dev_priv,
 err_pid:
         put_pid(ctx->pid);
         idr_remove(&file_priv->context_idr, ctx->user_handle);
+err_lut:
+        kvfree(ctx->vma_lut.ht);
 err_out:
         context_close(ctx);
         return ERR_PTR(ret);
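
A note on the resize heuristic in resize_vma_ht above: bits = 1 + ilog2(4*lut->ht_count/3 + 1) picks the smallest power-of-two bucket count that keeps the load factor below roughly 3/4, clamped upward to VMA_HT_BITS by max(bits, VMA_HT_BITS). A small standalone illustration of the resulting table sizes (plain userspace C written for this commentary, not kernel code):

#include <stdio.h>

/* Same semantics as the kernel's ilog2() for nonzero inputs. */
static unsigned int ilog2_u(unsigned int v)
{
        unsigned int r = 0;

        while (v >>= 1)
                r++;
        return r;
}

int main(void)
{
        unsigned int counts[] = { 1, 4, 24, 100 };

        for (unsigned int i = 0; i < 4; i++) {
                unsigned int count = counts[i];
                unsigned int bits = 1 + ilog2_u(4 * count / 3 + 1);

                /* e.g. 24 entries -> 64 buckets, load factor 0.375 */
                printf("%u entries -> %u buckets\n", count, 1u << bits);
        }
        return 0;
}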

drivers/gpu/drm/i915/i915_gem_context.h

Lines changed: 26 additions & 0 deletions
@@ -143,6 +143,32 @@ struct i915_gem_context {
         /** ggtt_offset_bias: placement restriction for context objects */
         u32 ggtt_offset_bias;
 
+        struct i915_gem_context_vma_lut {
+                /** ht_size: last request size to allocate the hashtable for. */
+                unsigned int ht_size;
+#define I915_CTX_RESIZE_IN_PROGRESS BIT(0)
+                /** ht_bits: real log2(size) of hashtable. */
+                unsigned int ht_bits;
+                /** ht_count: current number of entries inside the hashtable */
+                unsigned int ht_count;
+
+                /** ht: the array of buckets comprising the simple hashtable */
+                struct hlist_head *ht;
+
+                /**
+                 * resize: After an execbuf completes, we check the load factor
+                 * of the hashtable. If the hashtable is too full, or too empty,
+                 * we schedule a task to resize the hashtable. During the
+                 * resize, the entries are moved between different buckets and
+                 * so we cannot simultaneously read the hashtable as it is
+                 * being resized (unlike rhashtable). Therefore we treat the
+                 * active work as a strong barrier, pausing a subsequent
+                 * execbuf to wait for the resize worker to complete, if
+                 * required.
+                 */
+                struct work_struct resize;
+        } vma_lut;
+
         /** engine: per-engine logical HW state */
         struct intel_context {
                 struct i915_vma *state;
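
For completeness, a hedged sketch of the other half of the scheme: how a newly resolved vma might be linked into vma_lut and how the resize worker gets kicked when the table grows too full. The helper name vma_lut_link and the exact load-factor check are assumptions for illustration; the actual linking and resize trigger live in i915_gem_execbuffer.c, outside this excerpt.

/*
 * Sketch under assumptions; assumes the i915 headers and that the
 * caller holds whatever serialisation the execbuf path provides.
 */
#include <linux/hash.h>
#include <linux/workqueue.h>
#include "i915_gem_context.h"

static void vma_lut_link(struct i915_gem_context *ctx,
                         struct i915_vma *vma, u32 handle)
{
        struct i915_gem_context_vma_lut *lut = &ctx->vma_lut;

        vma->ctx = ctx;
        vma->ctx_handle = handle;
        vma->obj->vma_hashed = vma;
        hlist_add_head(&vma->ctx_node,
                       &lut->ht[hash_32(handle, lut->ht_bits)]);

        /*
         * Too full (load factor above ~3/4)?  Ask the worker to rehash.
         * resize_vma_ht() clears I915_CTX_RESIZE_IN_PROGRESS when done.
         */
        lut->ht_count++;
        if (!(lut->ht_size & I915_CTX_RESIZE_IN_PROGRESS) &&
            4 * lut->ht_count > 3 * lut->ht_size) {
                lut->ht_size |= I915_CTX_RESIZE_IN_PROGRESS;
                schedule_work(&lut->resize);
        }
}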
