Commit 7dd4f67

drm/i915: Async GPU relocation processing
If the user requires patching of their batch or auxiliary buffers, we currently make the alterations on the CPU. If they are active on the GPU at the time, we wait under the struct_mutex for them to finish executing before we rewrite the contents. This happens, for example, when shared relocation trees are used between different contexts with separate address spaces (the buffers then have different addresses in each), so the 3D state needs to be adjusted between execution on each context.

However, we don't need to use the CPU to do the relocation patching: we can queue commands to the GPU to perform it and use fences to serialise the operation with the current and future activity, so the operation on the GPU appears just as atomic as performing it immediately. Performing the relocation rewrites on the GPU is not free; in terms of pure throughput, the number of relocations/s is about halved - but, more importantly, so is the time spent under the struct_mutex.

v2: Break out the request/batch allocation for clearer error flow.
v3: A few asserts to ensure rq ordering is maintained.

Signed-off-by: Chris Wilson <[email protected]>
Reviewed-by: Joonas Lahtinen <[email protected]>
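For illustration only, the sketch below models how a single qword-aligned 64-bit relocation can be encoded as GPU commands, mirroring the MI_STORE_DWORD_IMM path this patch adds to relocate_entry() in i915_gem_execbuffer.c. It is a standalone user-space model, not driver code: the opcode value is a placeholder (the real definition lives in the i915 command headers), and in the driver the dwords are written into a batch-pool buffer obtained via reloc_gpu() and serialised with fences.

#include <stdint.h>
#include <stdio.h>

/*
 * Placeholder for the gen8+ MI_STORE_DWORD_IMM opcode; the real value comes
 * from the i915 command definitions and is not reproduced here.
 */
#define MODEL_MI_STORE_DWORD_IMM 0x10400002u

/*
 * Model of the qword-aligned GPU relocation: a single MI_STORE_DWORD_IMM
 * with its dword count bumped by one writes both halves of the 64-bit
 * presumed offset to 'addr'.  Returns the number of dwords emitted (5,
 * matching the "len = 5" case in relocate_entry()).
 */
static unsigned int emit_qword_reloc(uint32_t *cs, uint64_t addr, uint64_t value)
{
	uint32_t *start = cs;

	*cs++ = (MODEL_MI_STORE_DWORD_IMM | (1 << 21)) + 1; /* qword variant, as in the patch */
	*cs++ = (uint32_t)addr;          /* lower_32_bits(addr) */
	*cs++ = (uint32_t)(addr >> 32);  /* upper_32_bits(addr) */
	*cs++ = (uint32_t)value;         /* lower_32_bits(target_offset) */
	*cs++ = (uint32_t)(value >> 32); /* upper_32_bits(target_offset) */

	return (unsigned int)(cs - start);
}

int main(void)
{
	uint32_t cs[8];
	unsigned int len = emit_qword_reloc(cs, 0x100040ull, 0x200000ull);

	printf("emitted %u dwords, header 0x%08x\n", len, cs[0]);
	return 0;
}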
1 parent 1a71cf2 commit 7dd4f67

File tree: 2 files changed, +220 -8 lines
  drivers/gpu/drm/i915/i915_gem.c
  drivers/gpu/drm/i915/i915_gem_execbuffer.c

drivers/gpu/drm/i915/i915_gem.c (0 additions, 1 deletion)
@@ -4397,7 +4397,6 @@ static void __i915_gem_free_objects(struct drm_i915_private *i915,
 		GEM_BUG_ON(i915_gem_object_is_active(obj));
 		list_for_each_entry_safe(vma, vn,
 					 &obj->vma_list, obj_link) {
-			GEM_BUG_ON(!i915_vma_is_ggtt(vma));
 			GEM_BUG_ON(i915_vma_is_active(vma));
 			vma->flags &= ~I915_VMA_PIN_MASK;
 			i915_vma_close(vma);

drivers/gpu/drm/i915/i915_gem_execbuffer.c (220 additions, 7 deletions)
@@ -40,7 +40,12 @@
 #include "intel_drv.h"
 #include "intel_frontbuffer.h"
 
-#define DBG_USE_CPU_RELOC 0 /* -1 force GTT relocs; 1 force CPU relocs */
+enum {
+	FORCE_CPU_RELOC = 1,
+	FORCE_GTT_RELOC,
+	FORCE_GPU_RELOC,
+#define DBG_FORCE_RELOC 0 /* choose one of the above! */
+};
 
 #define __EXEC_OBJECT_HAS_REF		BIT(31)
 #define __EXEC_OBJECT_HAS_PIN		BIT(30)
@@ -212,10 +217,15 @@ struct i915_execbuffer {
 		struct drm_mm_node node; /** temporary GTT binding */
 		unsigned long vaddr; /** Current kmap address */
 		unsigned long page; /** Currently mapped page index */
+		unsigned int gen; /** Cached value of INTEL_GEN */
 		bool use_64bit_reloc : 1;
 		bool has_llc : 1;
 		bool has_fence : 1;
 		bool needs_unfenced : 1;
+
+		struct drm_i915_gem_request *rq;
+		u32 *rq_cmd;
+		unsigned int rq_size;
 	} reloc_cache;
 
 	u64 invalid_flags; /** Set of execobj.flags that are invalid */
@@ -496,8 +506,11 @@ static inline int use_cpu_reloc(const struct reloc_cache *cache,
 	if (!i915_gem_object_has_struct_page(obj))
 		return false;
 
-	if (DBG_USE_CPU_RELOC)
-		return DBG_USE_CPU_RELOC > 0;
+	if (DBG_FORCE_RELOC == FORCE_CPU_RELOC)
+		return true;
+
+	if (DBG_FORCE_RELOC == FORCE_GTT_RELOC)
+		return false;
 
 	return (cache->has_llc ||
 		obj->cache_dirty ||
@@ -887,6 +900,8 @@ static void eb_reset_vmas(const struct i915_execbuffer *eb)
 
 static void eb_destroy(const struct i915_execbuffer *eb)
 {
+	GEM_BUG_ON(eb->reloc_cache.rq);
+
 	if (eb->lut_size >= 0)
 		kfree(eb->buckets);
 }
@@ -904,11 +919,14 @@ static void reloc_cache_init(struct reloc_cache *cache,
 	cache->page = -1;
 	cache->vaddr = 0;
 	/* Must be a variable in the struct to allow GCC to unroll. */
+	cache->gen = INTEL_GEN(i915);
 	cache->has_llc = HAS_LLC(i915);
-	cache->has_fence = INTEL_GEN(i915) < 4;
-	cache->needs_unfenced = INTEL_INFO(i915)->unfenced_needs_alignment;
 	cache->use_64bit_reloc = HAS_64BIT_RELOC(i915);
+	cache->has_fence = cache->gen < 4;
+	cache->needs_unfenced = INTEL_INFO(i915)->unfenced_needs_alignment;
 	cache->node.allocated = false;
+	cache->rq = NULL;
+	cache->rq_size = 0;
 }
 
 static inline void *unmask_page(unsigned long p)
@@ -930,10 +948,24 @@ static inline struct i915_ggtt *cache_to_ggtt(struct reloc_cache *cache)
 	return &i915->ggtt;
 }
 
+static void reloc_gpu_flush(struct reloc_cache *cache)
+{
+	GEM_BUG_ON(cache->rq_size >= cache->rq->batch->obj->base.size / sizeof(u32));
+	cache->rq_cmd[cache->rq_size] = MI_BATCH_BUFFER_END;
+	i915_gem_object_unpin_map(cache->rq->batch->obj);
+	i915_gem_chipset_flush(cache->rq->i915);
+
+	__i915_add_request(cache->rq, true);
+	cache->rq = NULL;
+}
+
 static void reloc_cache_reset(struct reloc_cache *cache)
 {
 	void *vaddr;
 
+	if (cache->rq)
+		reloc_gpu_flush(cache);
+
 	if (!cache->vaddr)
 		return;
 
@@ -1099,6 +1131,121 @@ static void clflush_write32(u32 *addr, u32 value, unsigned int flushes)
 	*addr = value;
 }
 
+static int __reloc_gpu_alloc(struct i915_execbuffer *eb,
+			     struct i915_vma *vma,
+			     unsigned int len)
+{
+	struct reloc_cache *cache = &eb->reloc_cache;
+	struct drm_i915_gem_object *obj;
+	struct drm_i915_gem_request *rq;
+	struct i915_vma *batch;
+	u32 *cmd;
+	int err;
+
+	GEM_BUG_ON(vma->obj->base.write_domain & I915_GEM_DOMAIN_CPU);
+
+	obj = i915_gem_batch_pool_get(&eb->engine->batch_pool, PAGE_SIZE);
+	if (IS_ERR(obj))
+		return PTR_ERR(obj);
+
+	cmd = i915_gem_object_pin_map(obj,
+				      cache->has_llc ? I915_MAP_WB : I915_MAP_WC);
+	i915_gem_object_unpin_pages(obj);
+	if (IS_ERR(cmd))
+		return PTR_ERR(cmd);
+
+	err = i915_gem_object_set_to_wc_domain(obj, false);
+	if (err)
+		goto err_unmap;
+
+	batch = i915_vma_instance(obj, vma->vm, NULL);
+	if (IS_ERR(batch)) {
+		err = PTR_ERR(batch);
+		goto err_unmap;
+	}
+
+	err = i915_vma_pin(batch, 0, 0, PIN_USER | PIN_NONBLOCK);
+	if (err)
+		goto err_unmap;
+
+	rq = i915_gem_request_alloc(eb->engine, eb->ctx);
+	if (IS_ERR(rq)) {
+		err = PTR_ERR(rq);
+		goto err_unpin;
+	}
+
+	err = i915_gem_request_await_object(rq, vma->obj, true);
+	if (err)
+		goto err_request;
+
+	err = eb->engine->emit_flush(rq, EMIT_INVALIDATE);
+	if (err)
+		goto err_request;
+
+	err = i915_switch_context(rq);
+	if (err)
+		goto err_request;
+
+	err = eb->engine->emit_bb_start(rq,
+					batch->node.start, PAGE_SIZE,
+					cache->gen > 5 ? 0 : I915_DISPATCH_SECURE);
+	if (err)
+		goto err_request;
+
+	GEM_BUG_ON(!reservation_object_test_signaled_rcu(obj->resv, true));
+	i915_vma_move_to_active(batch, rq, 0);
+	reservation_object_lock(obj->resv, NULL);
+	reservation_object_add_excl_fence(obj->resv, &rq->fence);
+	reservation_object_unlock(obj->resv);
+	i915_vma_unpin(batch);
+
+	i915_vma_move_to_active(vma, rq, true);
+	reservation_object_lock(vma->obj->resv, NULL);
+	reservation_object_add_excl_fence(vma->obj->resv, &rq->fence);
+	reservation_object_unlock(vma->obj->resv);
+
+	rq->batch = batch;
+
+	cache->rq = rq;
+	cache->rq_cmd = cmd;
+	cache->rq_size = 0;
+
+	/* Return with batch mapping (cmd) still pinned */
+	return 0;
+
+err_request:
+	i915_add_request(rq);
+err_unpin:
+	i915_vma_unpin(batch);
+err_unmap:
+	i915_gem_object_unpin_map(obj);
+	return err;
+}
+
+static u32 *reloc_gpu(struct i915_execbuffer *eb,
+		      struct i915_vma *vma,
+		      unsigned int len)
+{
+	struct reloc_cache *cache = &eb->reloc_cache;
+	u32 *cmd;
+
+	if (cache->rq_size > PAGE_SIZE/sizeof(u32) - (len + 1))
+		reloc_gpu_flush(cache);
+
+	if (unlikely(!cache->rq)) {
+		int err;
+
+		err = __reloc_gpu_alloc(eb, vma, len);
+		if (unlikely(err))
+			return ERR_PTR(err);
+	}
+
+	cmd = cache->rq_cmd + cache->rq_size;
+	cache->rq_size += len;
+
+	return cmd;
+}
+
 static u64
 relocate_entry(struct i915_vma *vma,
 	       const struct drm_i915_gem_relocation_entry *reloc,
@@ -1111,6 +1258,67 @@ relocate_entry(struct i915_vma *vma,
 	bool wide = eb->reloc_cache.use_64bit_reloc;
 	void *vaddr;
 
+	if (!eb->reloc_cache.vaddr &&
+	    (DBG_FORCE_RELOC == FORCE_GPU_RELOC ||
+	     !reservation_object_test_signaled_rcu(obj->resv, true))) {
+		const unsigned int gen = eb->reloc_cache.gen;
+		unsigned int len;
+		u32 *batch;
+		u64 addr;
+
+		if (wide)
+			len = offset & 7 ? 8 : 5;
+		else if (gen >= 4)
+			len = 4;
+		else if (gen >= 3)
+			len = 3;
+		else /* On gen2 MI_STORE_DWORD_IMM uses a physical address */
+			goto repeat;
+
+		batch = reloc_gpu(eb, vma, len);
+		if (IS_ERR(batch))
+			goto repeat;
+
+		addr = gen8_canonical_addr(vma->node.start + offset);
+		if (wide) {
+			if (offset & 7) {
+				*batch++ = MI_STORE_DWORD_IMM_GEN4;
+				*batch++ = lower_32_bits(addr);
+				*batch++ = upper_32_bits(addr);
+				*batch++ = lower_32_bits(target_offset);
+
+				addr = gen8_canonical_addr(addr + 4);
+
+				*batch++ = MI_STORE_DWORD_IMM_GEN4;
+				*batch++ = lower_32_bits(addr);
+				*batch++ = upper_32_bits(addr);
+				*batch++ = upper_32_bits(target_offset);
+			} else {
+				*batch++ = (MI_STORE_DWORD_IMM_GEN4 | (1 << 21)) + 1;
+				*batch++ = lower_32_bits(addr);
+				*batch++ = upper_32_bits(addr);
+				*batch++ = lower_32_bits(target_offset);
+				*batch++ = upper_32_bits(target_offset);
+			}
+		} else if (gen >= 6) {
+			*batch++ = MI_STORE_DWORD_IMM_GEN4;
+			*batch++ = 0;
+			*batch++ = addr;
+			*batch++ = target_offset;
+		} else if (gen >= 4) {
+			*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+			*batch++ = 0;
+			*batch++ = addr;
+			*batch++ = target_offset;
+		} else {
+			*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
+			*batch++ = addr;
+			*batch++ = target_offset;
+		}
+
+		goto out;
+	}
+
 repeat:
 	vaddr = reloc_vaddr(obj, &eb->reloc_cache, offset >> PAGE_SHIFT);
 	if (IS_ERR(vaddr))
@@ -1127,6 +1335,7 @@ relocate_entry(struct i915_vma *vma,
 		goto repeat;
 	}
 
+out:
 	return target->node.start | UPDATE;
 }
 
@@ -1189,7 +1398,8 @@ eb_relocate_entry(struct i915_execbuffer *eb,
 	 * If the relocation already has the right value in it, no
	 * more work needs to be done.
	 */
-	if (gen8_canonical_addr(target->node.start) == reloc->presumed_offset)
+	if (!DBG_FORCE_RELOC &&
+	    gen8_canonical_addr(target->node.start) == reloc->presumed_offset)
 		return 0;
 
 	/* Check that the relocation address is valid... */
@@ -1915,7 +2125,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 	eb.i915 = to_i915(dev);
 	eb.file = file;
 	eb.args = args;
-	if (!(args->flags & I915_EXEC_NO_RELOC))
+	if (DBG_FORCE_RELOC || !(args->flags & I915_EXEC_NO_RELOC))
 		args->flags |= __EXEC_HAS_RELOC;
 	eb.exec = exec;
 	eb.ctx = NULL;
@@ -2068,6 +2278,9 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 		eb.batch = vma;
 	}
 
+	/* All GPU relocation batches must be submitted prior to the user rq */
+	GEM_BUG_ON(eb.reloc_cache.rq);
+
 	/* Allocate a request for this batch buffer nice and early. */
 	eb.request = i915_gem_request_alloc(eb.engine, eb.ctx);
 	if (IS_ERR(eb.request)) {
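A usage note on the sizing in reloc_gpu() above: each call reserves len dwords from the PAGE_SIZE batch-pool buffer, and reloc_gpu_flush() appends a terminating MI_BATCH_BUFFER_END, so the flush threshold is PAGE_SIZE/sizeof(u32) - (len + 1), keeping one dword spare for the terminator. The snippet below is a minimal user-space model of that check; the 4096-byte page size is an assumption for the model, not taken from the driver headers, and the snippet is illustrative only.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MODEL_PAGE_SIZE 4096u /* assumed size of the batch-pool object */

/*
 * Model of reloc_gpu()'s space check: with rq_size dwords already emitted,
 * a further 'len'-dword packet only fits if one dword is left spare for the
 * MI_BATCH_BUFFER_END written by reloc_gpu_flush().
 */
static bool reloc_batch_has_space(unsigned int rq_size, unsigned int len)
{
	return rq_size <= MODEL_PAGE_SIZE / sizeof(uint32_t) - (len + 1);
}

int main(void)
{
	/* 1024 dwords per page: 1015 + 8 still leaves the final dword free,
	 * while 1016 + 8 would fill the page, so the cache must flush first. */
	printf("%d\n", reloc_batch_has_space(1015, 8)); /* 1 */
	printf("%d\n", reloc_batch_has_space(1016, 8)); /* 0 */
	return 0;
}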
