@@ -144,31 +144,79 @@ void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
         }
 }
 
-static void mr_leaf_free_action(struct work_struct *work)
+/*
+ * This must be called after the mr has been removed from implicit_children
+ * and odp_mkeys and the SRCU synchronized. NOTE: The MR does not necessarily
+ * have to be empty here, parallel page faults could have raced with the free
+ * process and added pages to it.
+ */
+static void free_implicit_child_mr(struct mlx5_ib_mr *mr, bool need_imr_xlt)
 {
-        struct ib_umem_odp *odp = container_of(work, struct ib_umem_odp, work);
-        int idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
-        struct mlx5_ib_mr *mr = odp->private, *imr = mr->parent;
+        struct mlx5_ib_mr *imr = mr->parent;
         struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
+        struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
+        unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
         int srcu_key;
 
-        mr->parent = NULL;
-        synchronize_srcu(&mr->dev->odp_srcu);
+        /* implicit_child_mr's are not allowed to have deferred work */
+        WARN_ON(atomic_read(&mr->num_deferred_work));
 
-        if (xa_load(&mr->dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key))) {
+        if (need_imr_xlt) {
                 srcu_key = srcu_read_lock(&mr->dev->odp_srcu);
                 mutex_lock(&odp_imr->umem_mutex);
-                mlx5_ib_update_xlt(imr, idx, 1, 0,
+                mlx5_ib_update_xlt(mr->parent, idx, 1, 0,
                                    MLX5_IB_UPD_XLT_INDIRECT |
                                    MLX5_IB_UPD_XLT_ATOMIC);
                 mutex_unlock(&odp_imr->umem_mutex);
                 srcu_read_unlock(&mr->dev->odp_srcu, srcu_key);
         }
-        ib_umem_odp_release(odp);
+
+        mr->parent = NULL;
         mlx5_mr_cache_free(mr->dev, mr);
+        ib_umem_odp_release(odp);
+        atomic_dec(&imr->num_deferred_work);
+}
+
+static void free_implicit_child_mr_work(struct work_struct *work)
+{
+        struct mlx5_ib_mr *mr =
+                container_of(work, struct mlx5_ib_mr, odp_destroy.work);
+
+        free_implicit_child_mr(mr, true);
+}
+
+static void free_implicit_child_mr_rcu(struct rcu_head *head)
+{
+        struct mlx5_ib_mr *mr =
+                container_of(head, struct mlx5_ib_mr, odp_destroy.rcu);
+
+        /* Freeing a MR is a sleeping operation, so bounce to a work queue */
+        INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work);
+        queue_work(system_unbound_wq, &mr->odp_destroy.work);
+}
+
+static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
+{
+        struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
+        unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
+        struct mlx5_ib_mr *imr = mr->parent;
 
-        if (atomic_dec_and_test(&imr->num_leaf_free))
-                wake_up(&imr->q_leaf_free);
+        xa_lock(&imr->implicit_children);
+        /*
+         * This can race with mlx5_ib_free_implicit_mr(), the first one to
+         * reach the xa lock wins the race and destroys the MR.
+         */
+        if (__xa_cmpxchg(&imr->implicit_children, idx, mr, NULL, GFP_ATOMIC) !=
+            mr)
+                goto out_unlock;
+
+        __xa_erase(&mr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key));
+        atomic_inc(&imr->num_deferred_work);
+        call_srcu(&mr->dev->odp_srcu, &mr->odp_destroy.rcu,
+                  free_implicit_child_mr_rcu);
+
+out_unlock:
+        xa_unlock(&imr->implicit_children);
 }
 
 void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
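The hunk above replaces the old per-leaf work-queue free with a compare-and-swap race: both the invalidation path and mlx5_ib_free_implicit_mr() may try to retire the same child MR, and only the caller that swaps the implicit_children entry from the child back to NULL goes on to schedule its destruction. The following is a minimal user-space sketch of that "CAS winner performs the teardown" idiom, not the driver code; all names are invented, and C11 atomics stand in for the xarray and its lock.

```c
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct child {
        int id;
};

/* Stand-in for the imr->implicit_children slot holding this child. */
static _Atomic(struct child *) slot;

/* Returns 1 if this caller won the race and may tear the object down. */
static int claim_for_destroy(struct child *expected)
{
        struct child *old = expected;

        /*
         * Analogous to __xa_cmpxchg(..., idx, mr, NULL, ...) != mr: only
         * one caller observes the old value and replaces it with NULL.
         */
        return atomic_compare_exchange_strong(&slot, &old,
                                              (struct child *)NULL);
}

int main(void)
{
        struct child *c = malloc(sizeof(*c));

        c->id = 7;
        atomic_store(&slot, c);

        int first = claim_for_destroy(c);   /* wins: slot still held c */
        int second = claim_for_destroy(c);  /* loses: slot is already NULL */

        printf("first=%d second=%d\n", first, second);
        if (first)
                free(c);   /* exactly one winner performs the teardown */
        return 0;
}
```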
@@ -240,15 +288,8 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
 
         ib_umem_odp_unmap_dma_pages(umem_odp, start, end);
 
-        if (unlikely(!umem_odp->npages && mr->parent &&
-                     !umem_odp->dying)) {
-                xa_erase(&mr->parent->implicit_children,
-                         ib_umem_start(umem_odp) >> MLX5_IMR_MTT_SHIFT);
-                xa_erase(&mr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key));
-                umem_odp->dying = 1;
-                atomic_inc(&mr->parent->num_leaf_free);
-                schedule_work(&umem_odp->work);
-        }
+        if (unlikely(!umem_odp->npages && mr->parent))
+                destroy_unused_implicit_child_mr(mr);
         mutex_unlock(&umem_odp->umem_mutex);
 }
 
@@ -375,7 +416,6 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
         mr->mmkey.iova = idx * MLX5_IMR_MTT_SIZE;
         mr->parent = imr;
         odp->private = mr;
-        INIT_WORK(&odp->work, mr_leaf_free_action);
 
         err = mlx5_ib_update_xlt(mr, 0,
                                  MLX5_IMR_MTT_ENTRIES,
@@ -391,7 +431,11 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
          * Once the store to either xarray completes any error unwind has to
          * use synchronize_srcu(). Avoid this with xa_reserve()
          */
-        ret = xa_cmpxchg(&imr->implicit_children, idx, NULL, mr, GFP_KERNEL);
+        ret = xa_cmpxchg(&imr->implicit_children, idx, NULL, mr,
+                         GFP_KERNEL);
+        if (likely(!ret))
+                xa_store(&imr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key),
+                         &mr->mmkey, GFP_ATOMIC);
         if (unlikely(ret)) {
                 if (xa_is_err(ret)) {
                         ret = ERR_PTR(xa_err(ret));
@@ -404,9 +448,6 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
                 goto out_release;
         }
 
-        xa_store(&imr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key),
-                 &mr->mmkey, GFP_ATOMIC);
-
         mlx5_ib_dbg(imr->dev, "key %x mr %p\n", mr->mmkey.key, mr);
         return mr;
 
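With the two hunks above, the mkey is only published into odp_mkeys once the cmpxchg into implicit_children has succeeded, so a failed claim can be unwound immediately without waiting out an SRCU grace period. Below is a rough sketch of that claim-then-publish ordering; it is an analogy with invented names and C11 atomics, not the driver's xarray code.

```c
#include <stdatomic.h>
#include <stdlib.h>

struct mkey {
        unsigned int key;
};

static _Atomic(struct mkey *) private_slot; /* ~ imr->implicit_children[idx] */
static _Atomic(struct mkey *) lookup_slot;  /* ~ the dev->odp_mkeys entry */

/* Install new, or free it and return whatever got installed first. */
static struct mkey *install(struct mkey *new)
{
        struct mkey *expected = NULL;

        /* Step 1: claim the private slot (the xa_cmpxchg() analogue). */
        if (!atomic_compare_exchange_strong(&private_slot, &expected, new)) {
                /* Never published, so no reader can hold it: free at once. */
                free(new);
                return expected;   /* the concurrently installed object */
        }

        /* Step 2: only after the claim succeeded, expose it to lookups. */
        atomic_store(&lookup_slot, new);
        return new;
}

int main(void)
{
        struct mkey *a = malloc(sizeof(*a));
        struct mkey *b = malloc(sizeof(*b));

        a->key = 1;
        b->key = 2;

        struct mkey *first = install(a);   /* claims and publishes a */
        struct mkey *second = install(b);  /* loses the claim; b is freed */

        /* Both callers end up using the single published object. */
        return (first == a && second == a) ? 0 : 1;
}
```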
@@ -445,9 +486,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
         imr->ibmr.lkey = imr->mmkey.key;
         imr->ibmr.rkey = imr->mmkey.key;
         imr->umem = &umem_odp->umem;
-        init_waitqueue_head(&imr->q_leaf_free);
-        atomic_set(&imr->num_leaf_free, 0);
-        atomic_set(&imr->num_pending_prefetch, 0);
+        atomic_set(&imr->num_deferred_work, 0);
         xa_init(&imr->implicit_children);
 
         err = mlx5_ib_update_xlt(imr, 0,
@@ -477,35 +516,48 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
 void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
 {
         struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
+        struct mlx5_ib_dev *dev = imr->dev;
+        struct list_head destroy_list;
         struct mlx5_ib_mr *mtt;
+        struct mlx5_ib_mr *tmp;
         unsigned long idx;
 
-        mutex_lock(&odp_imr->umem_mutex);
-        xa_for_each(&imr->implicit_children, idx, mtt) {
-                struct ib_umem_odp *umem_odp = to_ib_umem_odp(mtt->umem);
+        INIT_LIST_HEAD(&destroy_list);
 
-                xa_erase(&imr->implicit_children, idx);
+        xa_erase(&dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key));
+        /*
+         * This stops the SRCU protected page fault path from touching either
+         * the imr or any children. The page fault path can only reach the
+         * children xarray via the imr.
+         */
+        synchronize_srcu(&dev->odp_srcu);
 
-                mutex_lock(&umem_odp->umem_mutex);
-                ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
-                                            ib_umem_end(umem_odp));
+        xa_lock(&imr->implicit_children);
+        xa_for_each(&imr->implicit_children, idx, mtt) {
+                __xa_erase(&imr->implicit_children, idx);
+                __xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mtt->mmkey.key));
+                list_add(&mtt->odp_destroy.elm, &destroy_list);
+        }
+        xa_unlock(&imr->implicit_children);
 
-                if (umem_odp->dying) {
-                        mutex_unlock(&umem_odp->umem_mutex);
-                        continue;
-                }
+        /* Fence access to the child pointers via the pagefault thread */
+        synchronize_srcu(&dev->odp_srcu);
 
-                umem_odp->dying = 1;
-                atomic_inc(&imr->num_leaf_free);
-                schedule_work(&umem_odp->work);
-                mutex_unlock(&umem_odp->umem_mutex);
+        /*
+         * num_deferred_work can only be incremented inside the odp_srcu, or
+         * under xa_lock while the child is in the xarray. Thus at this point
+         * it is only decreasing, and all work holding it is now on the wq.
+         */
+        if (atomic_read(&imr->num_deferred_work)) {
+                flush_workqueue(system_unbound_wq);
+                WARN_ON(atomic_read(&imr->num_deferred_work));
         }
-        mutex_unlock(&odp_imr->umem_mutex);
 
-        wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free));
-        WARN_ON(!xa_empty(&imr->implicit_children));
-        /* Remove any left over reserved elements */
-        xa_destroy(&imr->implicit_children);
+        list_for_each_entry_safe(mtt, tmp, &destroy_list, odp_destroy.elm)
+                free_implicit_child_mr(mtt, false);
+
+        mlx5_mr_cache_free(dev, imr);
+        ib_umem_odp_release(odp_imr);
 }
 
 #define MLX5_PF_FLAGS_DOWNGRADE BIT(1)
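The rewritten mlx5_ib_free_implicit_mr() unpublishes the imr, fences readers with synchronize_srcu(), detaches all children onto a private destroy_list under the xa_lock, fences again, drains any deferred work, and only then destroys everything with no locks held. The sketch below is a simplified user-space analogy of the collect-under-lock / destroy-after-unlock and deferred-work draining patterns; names are invented, the SRCU fencing is omitted, and a busy-wait stands in for flush_workqueue().

```c
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct child {
        struct child *next;
        int id;
};

static pthread_mutex_t children_lock = PTHREAD_MUTEX_INITIALIZER;
static struct child *children;        /* ~ imr->implicit_children */
static atomic_int num_deferred_work;  /* incremented by deferred destroys in
                                         the driver; stays zero in this demo */

static void free_all_children(void)
{
        struct child *destroy_list = NULL;
        struct child *c, *tmp;

        /* Phase 1: unhook everything while holding the lock; no freeing yet. */
        pthread_mutex_lock(&children_lock);
        while ((c = children)) {
                children = c->next;
                c->next = destroy_list;
                destroy_list = c;
        }
        pthread_mutex_unlock(&children_lock);

        /* Phase 2: any previously queued deferred destruction must finish
         * before the parent goes away (the flush_workqueue() analogue). */
        while (atomic_load(&num_deferred_work))
                ;

        /* Phase 3: the expensive teardown runs with no locks held. */
        for (c = destroy_list; c; c = tmp) {
                tmp = c->next;
                printf("freeing child %d\n", c->id);
                free(c);
        }
}

int main(void)
{
        for (int i = 0; i < 3; i++) {
                struct child *c = malloc(sizeof(*c));

                c->id = i;
                c->next = children;
                children = c;
        }
        free_all_children();
        return 0;
}
```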
@@ -1579,7 +1631,7 @@ static void destroy_prefetch_work(struct prefetch_mr_work *work)
         u32 i;
 
         for (i = 0; i < work->num_sge; ++i)
-                atomic_dec(&work->frags[i].mr->num_pending_prefetch);
+                atomic_dec(&work->frags[i].mr->num_deferred_work);
         kvfree(work);
 }
 
@@ -1658,7 +1710,7 @@ static bool init_prefetch_work(struct ib_pd *pd,
                 }
 
                 /* Keep the MR pointer will valid outside the SRCU */
-                atomic_inc(&work->frags[i].mr->num_pending_prefetch);
+                atomic_inc(&work->frags[i].mr->num_deferred_work);
         }
         work->num_sge = num_sge;
         return true;