Commit e27ab73

committed
drm/i915: Mark CPU cache as dirty on every transition for CPU writes
Currently, we only mark the CPU cache as dirty if we skip a clflush. This leads to some confusion where we have to ask if the object is in the write domain or missed a clflush. If we always mark the cache as dirty, this becomes a much simpler question to answer.

The goal remains to do as few clflushes as required and to do them as late as possible, in the hope of deferring the work to a kthread and not blocking the caller (e.g. execbuf, flips).

v2: Always call clflush before GPU execution when the cache_dirty flag is set. This may cause some extra work on LLC systems that migrate dirty buffers back and forth, but we do try to limit that by only setting cache_dirty at the end of the GPU sequence.

v3: Always mark the cache as dirty upon a level change, as we need to invalidate any stale cachelines due to external writes.

Reported-by: Dongwon Kim <[email protected]>
Fixes: a6a7cc4 ("drm/i915: Always flush the dirty CPU cache when pinning the scanout")
Signed-off-by: Chris Wilson <[email protected]>
Cc: Dongwon Kim <[email protected]>
Cc: Matt Roper <[email protected]>
Tested-by: Dongwon Kim <[email protected]>
Link: http://patchwork.freedesktop.org/patch/msgid/[email protected]
Reviewed-by: Tvrtko Ursulin <[email protected]>
1 parent b8e5d2e commit e27ab73
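
To make the bookkeeping described in the commit message concrete, here is a minimal, self-contained C sketch of the idea (illustrative only, not the kernel code; the struct and function names below are invented): every transition that can leave stale or dirty CPU cachelines sets a single cache_dirty flag, and the clflush decision before GPU use reduces to checking that flag.

/*
 * Toy model of the cache_dirty bookkeeping described above.  This is
 * NOT the kernel code: all names are hypothetical, and real coherency
 * depends on the platform (LLC, snooping, write-through, ...).
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_obj {
        bool coherent;    /* e.g. LLC-coherent or snooped by the GPU */
        bool cache_dirty; /* CPU cachelines may be stale or dirty */
};

/* A CPU write leaves dirty cachelines behind on non-coherent objects. */
static void toy_cpu_write(struct toy_obj *obj)
{
        if (!obj->coherent)
                obj->cache_dirty = true;
}

/* A cache-level change may leave stale cachelines, so always mark dirty. */
static void toy_set_cache_level(struct toy_obj *obj, bool now_coherent)
{
        obj->coherent = now_coherent;
        obj->cache_dirty = true;
}

/* Before GPU use, a single flag answers "do we need a clflush?". */
static void toy_move_to_gpu(struct toy_obj *obj)
{
        if (obj->cache_dirty) {
                printf("clflush before GPU use\n");
                obj->cache_dirty = false;
        }
}

int main(void)
{
        /* Mirrors "cache_dirty = !coherent" at object creation. */
        struct toy_obj obj = { .coherent = false, .cache_dirty = true };

        toy_move_to_gpu(&obj);           /* flush the freshly created object */
        toy_cpu_write(&obj);
        toy_move_to_gpu(&obj);           /* flush the CPU write */
        toy_set_cache_level(&obj, true);
        toy_move_to_gpu(&obj);           /* flush stale lines after the level change */
        return 0;
}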

File tree

6 files changed, +67 -56 lines changed

drivers/gpu/drm/i915/i915_gem.c

Lines changed: 46 additions & 30 deletions

@@ -49,7 +49,7 @@ static void i915_gem_flush_free_objects(struct drm_i915_private *i915);

 static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
 {
-        if (obj->base.write_domain == I915_GEM_DOMAIN_CPU)
+        if (obj->cache_dirty)
                 return false;

         if (!i915_gem_object_is_coherent(obj))
@@ -233,6 +233,14 @@ i915_gem_object_get_pages_phys(struct drm_i915_gem_object *obj)
         return st;
 }

+static void __start_cpu_write(struct drm_i915_gem_object *obj)
+{
+        obj->base.read_domains = I915_GEM_DOMAIN_CPU;
+        obj->base.write_domain = I915_GEM_DOMAIN_CPU;
+        if (cpu_write_needs_clflush(obj))
+                obj->cache_dirty = true;
+}
+
 static void
 __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
                                 struct sg_table *pages,
@@ -248,8 +256,7 @@ __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
             !i915_gem_object_is_coherent(obj))
                 drm_clflush_sg(pages);

-        obj->base.read_domains = I915_GEM_DOMAIN_CPU;
-        obj->base.write_domain = I915_GEM_DOMAIN_CPU;
+        __start_cpu_write(obj);
 }

 static void
@@ -684,6 +691,12 @@ i915_gem_dumb_create(struct drm_file *file,
                                args->size, &args->handle);
 }

+static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
+{
+        return !(obj->cache_level == I915_CACHE_NONE ||
+                 obj->cache_level == I915_CACHE_WT);
+}
+
 /**
  * Creates a new mm object and returns a handle to it.
  * @dev: drm device pointer
@@ -753,6 +766,11 @@ flush_write_domain(struct drm_i915_gem_object *obj, unsigned int flush_domains)
         case I915_GEM_DOMAIN_CPU:
                 i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
                 break;
+
+        case I915_GEM_DOMAIN_RENDER:
+                if (gpu_write_needs_clflush(obj))
+                        obj->cache_dirty = true;
+                break;
         }

         obj->base.write_domain = 0;
@@ -854,7 +872,8 @@ int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj,
          * optimizes for the case when the gpu will dirty the data
          * anyway again before the next pread happens.
          */
-        if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU))
+        if (!obj->cache_dirty &&
+            !(obj->base.read_domains & I915_GEM_DOMAIN_CPU))
                 *needs_clflush = CLFLUSH_BEFORE;

 out:
@@ -906,14 +925,16 @@ int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj,
          * This optimizes for the case when the gpu will use the data
          * right away and we therefore have to clflush anyway.
          */
-        if (obj->base.write_domain != I915_GEM_DOMAIN_CPU)
+        if (!obj->cache_dirty) {
                 *needs_clflush |= CLFLUSH_AFTER;

-        /* Same trick applies to invalidate partially written cachelines read
-         * before writing.
-         */
-        if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU))
-                *needs_clflush |= CLFLUSH_BEFORE;
+                /*
+                 * Same trick applies to invalidate partially written
+                 * cachelines read before writing.
+                 */
+                if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU))
+                        *needs_clflush |= CLFLUSH_BEFORE;
+        }

 out:
         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
@@ -3395,10 +3416,13 @@ int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags)

 static void __i915_gem_object_flush_for_display(struct drm_i915_gem_object *obj)
 {
-        if (obj->base.write_domain != I915_GEM_DOMAIN_CPU && !obj->cache_dirty)
-                return;
-
-        i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
+        /*
+         * We manually flush the CPU domain so that we can override and
+         * force the flush for the display, and perform it asyncrhonously.
+         */
+        flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
+        if (obj->cache_dirty)
+                i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
         obj->base.write_domain = 0;
 }

@@ -3657,13 +3681,10 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
                 }
         }

-        if (obj->base.write_domain == I915_GEM_DOMAIN_CPU &&
-            i915_gem_object_is_coherent(obj))
-                obj->cache_dirty = true;
-
         list_for_each_entry(vma, &obj->vma_list, obj_link)
                 vma->node.color = cache_level;
         obj->cache_level = cache_level;
+        obj->cache_dirty = true; /* Always invalidate stale cachelines */

         return 0;
 }
@@ -3885,9 +3906,6 @@ i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
         if (ret)
                 return ret;

-        if (obj->base.write_domain == I915_GEM_DOMAIN_CPU)
-                return 0;
-
         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);

         /* Flush the CPU cache if it's still invalid. */
@@ -3899,15 +3917,13 @@ i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
         /* It should now be out of any other write domains, and we can update
          * the domain values for our changes.
          */
-        GEM_BUG_ON((obj->base.write_domain & ~I915_GEM_DOMAIN_CPU) != 0);
+        GEM_BUG_ON(obj->base.write_domain & ~I915_GEM_DOMAIN_CPU);

         /* If we're writing through the CPU, then the GPU read domains will
          * need to be invalidated at next use.
          */
-        if (write) {
-                obj->base.read_domains = I915_GEM_DOMAIN_CPU;
-                obj->base.write_domain = I915_GEM_DOMAIN_CPU;
-        }
+        if (write)
+                __start_cpu_write(obj);

         return 0;
 }
@@ -4328,6 +4344,8 @@ i915_gem_object_create(struct drm_i915_private *dev_priv, u64 size)
         } else
                 obj->cache_level = I915_CACHE_NONE;

+        obj->cache_dirty = !i915_gem_object_is_coherent(obj);
+
         trace_i915_gem_object_create(obj);

         return obj;
@@ -4994,10 +5012,8 @@ int i915_gem_freeze_late(struct drm_i915_private *dev_priv)

         mutex_lock(&dev_priv->drm.struct_mutex);
         for (p = phases; *p; p++) {
-                list_for_each_entry(obj, *p, global_link) {
-                        obj->base.read_domains = I915_GEM_DOMAIN_CPU;
-                        obj->base.write_domain = I915_GEM_DOMAIN_CPU;
-                }
+                list_for_each_entry(obj, *p, global_link)
+                        __start_cpu_write(obj);
         }
         mutex_unlock(&dev_priv->drm.struct_mutex);

drivers/gpu/drm/i915/i915_gem_clflush.c

Lines changed: 7 additions & 8 deletions

@@ -71,8 +71,6 @@ static const struct dma_fence_ops i915_clflush_ops = {
 static void __i915_do_clflush(struct drm_i915_gem_object *obj)
 {
         drm_clflush_sg(obj->mm.pages);
-        obj->cache_dirty = false;
-
         intel_fb_obj_flush(obj, ORIGIN_CPU);
 }

@@ -81,9 +79,6 @@ static void i915_clflush_work(struct work_struct *work)
         struct clflush *clflush = container_of(work, typeof(*clflush), work);
         struct drm_i915_gem_object *obj = clflush->obj;

-        if (!obj->cache_dirty)
-                goto out;
-
         if (i915_gem_object_pin_pages(obj)) {
                 DRM_ERROR("Failed to acquire obj->pages for clflushing\n");
                 goto out;
@@ -131,10 +126,10 @@ void i915_gem_clflush_object(struct drm_i915_gem_object *obj,
          * anything not backed by physical memory we consider to be always
          * coherent and not need clflushing.
          */
-        if (!i915_gem_object_has_struct_page(obj))
+        if (!i915_gem_object_has_struct_page(obj)) {
+                obj->cache_dirty = false;
                 return;
-
-        obj->cache_dirty = true;
+        }

         /* If the GPU is snooping the contents of the CPU cache,
          * we do not need to manually clear the CPU cache lines. However,
@@ -153,6 +148,8 @@ void i915_gem_clflush_object(struct drm_i915_gem_object *obj,
         if (!(flags & I915_CLFLUSH_SYNC))
                 clflush = kmalloc(sizeof(*clflush), GFP_KERNEL);
         if (clflush) {
+                GEM_BUG_ON(!obj->cache_dirty);
+
                 dma_fence_init(&clflush->dma,
                                &i915_clflush_ops,
                                &clflush_lock,
@@ -180,4 +177,6 @@ void i915_gem_clflush_object(struct drm_i915_gem_object *obj,
         } else {
                 GEM_BUG_ON(obj->base.write_domain != I915_GEM_DOMAIN_CPU);
         }
+
+        obj->cache_dirty = false;
 }

drivers/gpu/drm/i915/i915_gem_execbuffer.c

Lines changed: 7 additions & 14 deletions

@@ -309,7 +309,7 @@ static inline int use_cpu_reloc(struct drm_i915_gem_object *obj)
                 return DBG_USE_CPU_RELOC > 0;

         return (HAS_LLC(to_i915(obj->base.dev)) ||
-                obj->base.write_domain == I915_GEM_DOMAIN_CPU ||
+                obj->cache_dirty ||
                 obj->cache_level != I915_CACHE_NONE);
 }

@@ -1110,10 +1110,8 @@ eb_move_to_gpu(struct i915_execbuffer *eb)
                 if (vma->exec_entry->flags & EXEC_OBJECT_ASYNC)
                         continue;

-                if (obj->base.write_domain & I915_GEM_DOMAIN_CPU) {
+                if (obj->cache_dirty)
                         i915_gem_clflush_object(obj, 0);
-                        obj->base.write_domain = 0;
-                }

                 ret = i915_gem_request_await_object
                         (eb->request, obj, obj->base.pending_write_domain);
@@ -1248,12 +1246,6 @@ static int eb_select_context(struct i915_execbuffer *eb)
         return 0;
 }

-static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
-{
-        return !(obj->cache_level == I915_CACHE_NONE ||
-                 obj->cache_level == I915_CACHE_WT);
-}
-
 void i915_vma_move_to_active(struct i915_vma *vma,
                              struct drm_i915_gem_request *req,
                              unsigned int flags)
@@ -1277,15 +1269,16 @@ void i915_vma_move_to_active(struct i915_vma *vma,
         i915_gem_active_set(&vma->last_read[idx], req);
         list_move_tail(&vma->vm_link, &vma->vm->active_list);

+        obj->base.write_domain = 0;
         if (flags & EXEC_OBJECT_WRITE) {
+                obj->base.write_domain = I915_GEM_DOMAIN_RENDER;
+
                 if (intel_fb_obj_invalidate(obj, ORIGIN_CS))
                         i915_gem_active_set(&obj->frontbuffer_write, req);

-                /* update for the implicit flush after a batch */
-                obj->base.write_domain &= ~I915_GEM_GPU_DOMAINS;
-                if (!obj->cache_dirty && gpu_write_needs_clflush(obj))
-                        obj->cache_dirty = true;
+                obj->base.read_domains = 0;
         }
+        obj->base.read_domains |= I915_GEM_GPU_DOMAINS;

         if (flags & EXEC_OBJECT_NEEDS_FENCE)
                 i915_gem_active_set(&vma->last_fence, req);

drivers/gpu/drm/i915/i915_gem_internal.c

Lines changed: 2 additions & 1 deletion

@@ -188,9 +188,10 @@ i915_gem_object_create_internal(struct drm_i915_private *i915,
         drm_gem_private_object_init(&i915->drm, &obj->base, size);
         i915_gem_object_init(obj, &i915_gem_object_internal_ops);

-        obj->base.write_domain = I915_GEM_DOMAIN_CPU;
         obj->base.read_domains = I915_GEM_DOMAIN_CPU;
+        obj->base.write_domain = I915_GEM_DOMAIN_CPU;
         obj->cache_level = HAS_LLC(i915) ? I915_CACHE_LLC : I915_CACHE_NONE;
+        obj->cache_dirty = !i915_gem_object_is_coherent(obj);

         return obj;
 }

drivers/gpu/drm/i915/i915_gem_userptr.c

Lines changed: 3 additions & 2 deletions

@@ -802,9 +802,10 @@ i915_gem_userptr_ioctl(struct drm_device *dev, void *data, struct drm_file *file

         drm_gem_private_object_init(dev, &obj->base, args->user_size);
         i915_gem_object_init(obj, &i915_gem_userptr_ops);
-        obj->cache_level = I915_CACHE_LLC;
-        obj->base.write_domain = I915_GEM_DOMAIN_CPU;
         obj->base.read_domains = I915_GEM_DOMAIN_CPU;
+        obj->base.write_domain = I915_GEM_DOMAIN_CPU;
+        obj->cache_level = I915_CACHE_LLC;
+        obj->cache_dirty = !i915_gem_object_is_coherent(obj);

         obj->userptr.ptr = args->user_ptr;
         obj->userptr.read_only = !!(args->flags & I915_USERPTR_READ_ONLY);

drivers/gpu/drm/i915/selftests/huge_gem_object.c

Lines changed: 2 additions & 1 deletion

@@ -126,9 +126,10 @@ huge_gem_object(struct drm_i915_private *i915,
         drm_gem_private_object_init(&i915->drm, &obj->base, dma_size);
         i915_gem_object_init(obj, &huge_ops);

-        obj->base.write_domain = I915_GEM_DOMAIN_CPU;
         obj->base.read_domains = I915_GEM_DOMAIN_CPU;
+        obj->base.write_domain = I915_GEM_DOMAIN_CPU;
         obj->cache_level = HAS_LLC(i915) ? I915_CACHE_LLC : I915_CACHE_NONE;
+        obj->cache_dirty = !i915_gem_object_is_coherent(obj);
         obj->scratch = phys_size;

         return obj;
