
Commit bb3dba3

Merge branch 'odp_rework' into rdma.git for-next

Jason Gunthorpe says:

====================
In order to hoist the interval tree code out of the drivers and into the
mmu_notifiers it is necessary for the drivers to not use the interval tree
for other things. This series replaces the interval tree with an xarray and
along the way re-aligns all the locking to use a sensible SRCU model where
the 'update' step is done by modifying an xarray.

The result is overall much simpler and with less locking in the critical
path. Many functions were reworked for clarity and small details like using
'imr' to refer to the implicit MR make the entire code flow here more
readable.

This also squashes at least two race bugs on its own, and quite possibly
more that haven't been identified.
====================

Merge conflicts with the odp statistics patch resolved.

* branch 'odp_rework':
  RDMA/odp: Remove broken debugging call to invalidate_range
  RDMA/mlx5: Do not race with mlx5_ib_invalidate_range during create and destroy
  RDMA/mlx5: Do not store implicit children in the odp_mkeys xarray
  RDMA/mlx5: Rework implicit ODP destroy
  RDMA/mlx5: Avoid double lookups on the pagefault path
  RDMA/mlx5: Reduce locking in implicit_mr_get_data()
  RDMA/mlx5: Use an xarray for the children of an implicit ODP
  RDMA/mlx5: Split implicit handling from pagefault_mr
  RDMA/mlx5: Set the HW IOVA of the child MRs to their place in the tree
  RDMA/mlx5: Lift implicit_mr_alloc() into the two routines that call it
  RDMA/mlx5: Rework implicit_mr_get_data
  RDMA/mlx5: Delete struct mlx5_priv->mkey_table
  RDMA/mlx5: Use a dedicated mkey xarray for ODP
  RDMA/mlx5: Split sig_err MR data into its own xarray
  RDMA/mlx5: Use SRCU properly in ODP prefetch

Signed-off-by: Jason Gunthorpe <[email protected]>

2 parents 0363133 + 46870b2, commit bb3dba3
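The locking model the cover letter describes — readers under SRCU, with the 'update' step being nothing more than mutating an xarray — can be sketched roughly as below. This is an illustrative fragment, not code from the series: lookup_and_use_mkey(), handle_mkey() and destroy_mkey() are hypothetical, while xa_load(), xa_erase(), srcu_read_lock()/srcu_read_unlock() and synchronize_srcu() are the real kernel primitives involved.

/*
 * Illustrative sketch of the SRCU + xarray model, assuming the
 * odp_srcu/odp_mkeys fields this series introduces.
 */
static void lookup_and_use_mkey(struct mlx5_ib_dev *dev, u32 key)
{
	struct mlx5_core_mkey *mmkey;
	int idx;

	idx = srcu_read_lock(&dev->odp_srcu);
	mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(key));
	if (mmkey)
		handle_mkey(dev, mmkey); /* cannot be freed while in SRCU */
	srcu_read_unlock(&dev->odp_srcu, idx);
}

static void destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_core_mkey *mmkey)
{
	/* The 'update' step: erase the entry so new readers miss it. */
	xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmkey->key));
	/* Wait out every reader that might still hold the pointer. */
	synchronize_srcu(&dev->odp_srcu);
	kfree(mmkey); /* now provably unused */
}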

File tree

11 files changed: +624, -663 lines


drivers/infiniband/core/umem_odp.c

Lines changed: 19 additions & 19 deletions

@@ -508,7 +508,6 @@ static int ib_umem_odp_map_dma_single_page(
 {
 	struct ib_device *dev = umem_odp->umem.ibdev;
 	dma_addr_t dma_addr;
-	int remove_existing_mapping = 0;
 	int ret = 0;
 
 	/*
@@ -534,28 +533,29 @@ static int ib_umem_odp_map_dma_single_page(
 	} else if (umem_odp->page_list[page_index] == page) {
 		umem_odp->dma_list[page_index] |= access_mask;
 	} else {
-		pr_err("error: got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n",
-		       umem_odp->page_list[page_index], page);
-		/* Better remove the mapping now, to prevent any further
-		 * damage. */
-		remove_existing_mapping = 1;
+		/*
+		 * This is a race here where we could have done:
+		 *
+		 *         CPU0                       CPU1
+		 *   get_user_pages()
+		 *                            invalidate()
+		 *                            page_fault()
+		 *   mutex_lock(umem_mutex)
+		 *    page from GUP != page in ODP
+		 *
		 * It should be prevented by the retry test above as reading
+		 * the seq number should be reliable under the
+		 * umem_mutex. Thus something is really not working right if
+		 * things get here.
+		 */
+		WARN(true,
+		     "Got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n",
+		     umem_odp->page_list[page_index], page);
+		ret = -EAGAIN;
 	}
 
 out:
 	put_user_page(page);
-
-	if (remove_existing_mapping) {
-		ib_umem_notifier_start_account(umem_odp);
-		dev->ops.invalidate_range(
-			umem_odp,
-			ib_umem_start(umem_odp) +
-				(page_index << umem_odp->page_shift),
-			ib_umem_start(umem_odp) +
-				((page_index + 1) << umem_odp->page_shift));
-		ib_umem_notifier_end_account(umem_odp);
-		ret = -EAGAIN;
-	}
-
 	return ret;
 }
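The "retry test above" that the new comment relies on is the usual mmu-notifier sequence-count idiom: sample a notifier sequence before get_user_pages(), then re-check it under umem_mutex and refault if an invalidation ran in between. A condensed sketch of that shape — read_notifier_seq(), gup_one_page() and notifier_seq_retry() are assumed helper names, not the umem_odp code itself:

/* Condensed sketch of the notifier-sequence retry idiom. */
static int fault_one_page(struct ib_umem_odp *umem_odp, u64 addr)
{
	unsigned long seq;
	struct page *page;
	int ret;

again:
	seq = read_notifier_seq(umem_odp);		/* assumed helper */
	ret = gup_one_page(umem_odp, addr, &page);	/* assumed helper */
	if (ret)
		return ret;

	mutex_lock(&umem_odp->umem_mutex);
	if (notifier_seq_retry(umem_odp, seq)) {	/* assumed helper */
		mutex_unlock(&umem_odp->umem_mutex);
		put_user_page(page);
		goto again;	/* an invalidate raced with us; refault */
	}
	/* Here GUP's page must equal the page in the ODP mapping. */
	mutex_unlock(&umem_odp->umem_mutex);
	return 0;
}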

drivers/infiniband/hw/mlx5/cq.c

Lines changed: 16 additions & 17 deletions

@@ -423,9 +423,6 @@ static int mlx5_poll_one(struct mlx5_ib_cq *cq,
 	struct mlx5_cqe64 *cqe64;
 	struct mlx5_core_qp *mqp;
 	struct mlx5_ib_wq *wq;
-	struct mlx5_sig_err_cqe *sig_err_cqe;
-	struct mlx5_core_mkey *mmkey;
-	struct mlx5_ib_mr *mr;
 	uint8_t opcode;
 	uint32_t qpn;
 	u16 wqe_ctr;
@@ -519,27 +516,29 @@ static int mlx5_poll_one(struct mlx5_ib_cq *cq,
 			}
 		}
 		break;
-	case MLX5_CQE_SIG_ERR:
-		sig_err_cqe = (struct mlx5_sig_err_cqe *)cqe64;
+	case MLX5_CQE_SIG_ERR: {
+		struct mlx5_sig_err_cqe *sig_err_cqe =
+			(struct mlx5_sig_err_cqe *)cqe64;
+		struct mlx5_core_sig_ctx *sig;
 
-		xa_lock(&dev->mdev->priv.mkey_table);
-		mmkey = xa_load(&dev->mdev->priv.mkey_table,
+		xa_lock(&dev->sig_mrs);
+		sig = xa_load(&dev->sig_mrs,
 			      mlx5_base_mkey(be32_to_cpu(sig_err_cqe->mkey)));
-		mr = to_mibmr(mmkey);
-		get_sig_err_item(sig_err_cqe, &mr->sig->err_item);
-		mr->sig->sig_err_exists = true;
-		mr->sig->sigerr_count++;
+		get_sig_err_item(sig_err_cqe, &sig->err_item);
+		sig->sig_err_exists = true;
+		sig->sigerr_count++;
 
 		mlx5_ib_warn(dev, "CQN: 0x%x Got SIGERR on key: 0x%x err_type %x err_offset %llx expected %x actual %x\n",
-			     cq->mcq.cqn, mr->sig->err_item.key,
-			     mr->sig->err_item.err_type,
-			     mr->sig->err_item.sig_err_offset,
-			     mr->sig->err_item.expected,
-			     mr->sig->err_item.actual);
+			     cq->mcq.cqn, sig->err_item.key,
+			     sig->err_item.err_type,
+			     sig->err_item.sig_err_offset,
+			     sig->err_item.expected,
+			     sig->err_item.actual);
 
-		xa_unlock(&dev->mdev->priv.mkey_table);
+		xa_unlock(&dev->sig_mrs);
 		goto repoll;
 	}
+	}
 
 	return 0;
 }
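Two details are worth noting in this hunk: the lookup now yields the mlx5_core_sig_ctx directly instead of an MR that had to be cast with to_mibmr(), and the fields are updated while xa_lock() is held. Since xa_erase() takes the same internal spinlock, an entry cannot be torn down while its err_item is being filled in. A generic sketch of that pattern, with struct sig_entry as a hypothetical stand-in for mlx5_core_sig_ctx:

/* Sketch: the xarray's own spinlock keeps an entry stable while its
 * fields are updated, because xa_erase() must take the same lock. */
struct sig_entry {
	bool err_exists;
	int err_count;
};

static void record_err(struct xarray *xa, unsigned long index)
{
	struct sig_entry *e;

	xa_lock(xa);
	e = xa_load(xa, index);
	if (e) {
		e->err_exists = true;	/* safe: an eraser blocks on xa_lock */
		e->err_count++;
	}
	xa_unlock(xa);
}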

drivers/infiniband/hw/mlx5/devx.c

Lines changed: 4 additions & 4 deletions

@@ -1265,8 +1265,8 @@ static int devx_handle_mkey_indirect(struct devx_obj *obj,
 	mkey->pd = MLX5_GET(mkc, mkc, pd);
 	devx_mr->ndescs = MLX5_GET(mkc, mkc, translations_octword_size);
 
-	return xa_err(xa_store(&dev->mdev->priv.mkey_table,
-			       mlx5_base_mkey(mkey->key), mkey, GFP_KERNEL));
+	return xa_err(xa_store(&dev->odp_mkeys, mlx5_base_mkey(mkey->key), mkey,
+			       GFP_KERNEL));
 }
 
 static int devx_handle_mkey_create(struct mlx5_ib_dev *dev,
@@ -1345,9 +1345,9 @@ static int devx_obj_cleanup(struct ib_uobject *uobject,
 		 * the mmkey, we must wait for that to stop before freeing the
 		 * mkey, as another allocation could get the same mkey #.
 		 */
-		xa_erase(&obj->ib_dev->mdev->priv.mkey_table,
+		xa_erase(&obj->ib_dev->odp_mkeys,
 			 mlx5_base_mkey(obj->devx_mr.mmkey.key));
-		synchronize_srcu(&dev->mr_srcu);
+		synchronize_srcu(&dev->odp_srcu);
 	}
 
 	if (obj->flags & DEVX_OBJ_FLAGS_DCT)
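The xa_err(xa_store(...)) wrapper in the first hunk is a standard xarray idiom: xa_store() returns the previous entry on success or an error encoded as a special pointer on failure, and xa_err() maps that to 0 or a negative errno, which is exactly the int a registration path wants to return. A minimal sketch, with register_mkey() as a hypothetical wrapper:

static int register_mkey(struct xarray *xa, unsigned long index, void *mkey)
{
	/* 0 on success (old entry or NULL), -ENOMEM etc. on failure. */
	return xa_err(xa_store(xa, index, mkey, GFP_KERNEL));
}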

drivers/infiniband/hw/mlx5/main.c

Lines changed: 8 additions & 9 deletions

@@ -6133,11 +6133,10 @@ static struct ib_counters *mlx5_ib_create_counters(struct ib_device *device,
 static void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev)
 {
 	mlx5_ib_cleanup_multiport_master(dev);
-	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
-		srcu_barrier(&dev->mr_srcu);
-		cleanup_srcu_struct(&dev->mr_srcu);
-	}
+	WARN_ON(!xa_empty(&dev->odp_mkeys));
+	cleanup_srcu_struct(&dev->odp_srcu);
 
+	WARN_ON(!xa_empty(&dev->sig_mrs));
 	WARN_ON(!bitmap_empty(dev->dm.memic_alloc_pages, MLX5_MAX_MEMIC_PAGES));
 }
 
@@ -6189,15 +6188,15 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev)
 	mutex_init(&dev->cap_mask_mutex);
 	INIT_LIST_HEAD(&dev->qp_list);
 	spin_lock_init(&dev->reset_flow_resource_lock);
+	xa_init(&dev->odp_mkeys);
+	xa_init(&dev->sig_mrs);
 
 	spin_lock_init(&dev->dm.lock);
 	dev->dm.dev = mdev;
 
-	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
-		err = init_srcu_struct(&dev->mr_srcu);
-		if (err)
-			goto err_mp;
-	}
+	err = init_srcu_struct(&dev->odp_srcu);
+	if (err)
+		goto err_mp;
 
 	return 0;
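The init path now sets up the SRCU struct and both xarrays unconditionally, and the cleanup path asserts that the xarrays were emptied before teardown. A condensed sketch of that pairing — stage_init()/stage_cleanup() are hypothetical wrappers around the real kernel APIs, and err_mp stands in for the earlier-stage unwind the real function jumps to:

/* xa_init() cannot fail, so only init_srcu_struct() needs unwinding. */
static int stage_init(struct mlx5_ib_dev *dev)
{
	int err;

	xa_init(&dev->odp_mkeys);
	xa_init(&dev->sig_mrs);

	err = init_srcu_struct(&dev->odp_srcu);
	if (err)
		goto err_mp;
	return 0;

err_mp:
	/* ... undo earlier setup here, elided ... */
	return err;
}

static void stage_cleanup(struct mlx5_ib_dev *dev)
{
	WARN_ON(!xa_empty(&dev->odp_mkeys));	/* every mkey erased? */
	WARN_ON(!xa_empty(&dev->sig_mrs));
	cleanup_srcu_struct(&dev->odp_srcu);
}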

drivers/infiniband/hw/mlx5/mlx5_ib.h

Lines changed: 18 additions & 6 deletions

@@ -604,7 +604,6 @@ struct mlx5_ib_mr {
 	struct mlx5_ib_dev *dev;
 	u32 out[MLX5_ST_SZ_DW(create_mkey_out)];
 	struct mlx5_core_sig_ctx *sig;
-	unsigned int live;
 	void *descs_alloc;
 	int access_flags; /* Needed for rereg MR */
 
@@ -616,12 +615,18 @@ struct mlx5_ib_mr {
 	u64 data_iova;
 	u64 pi_iova;
 
-	atomic_t num_leaf_free;
-	wait_queue_head_t q_leaf_free;
-	struct mlx5_async_work cb_work;
-	atomic_t num_pending_prefetch;
+	/* For ODP and implicit */
+	atomic_t num_deferred_work;
+	struct xarray implicit_children;
+	union {
+		struct rcu_head rcu;
+		struct list_head elm;
+		struct work_struct work;
+	} odp_destroy;
 	struct ib_odp_counters odp_stats;
 	bool is_odp_implicit;
+
+	struct mlx5_async_work cb_work;
 };
 
 static inline bool is_odp_mr(struct mlx5_ib_mr *mr)
@@ -977,7 +982,9 @@ struct mlx5_ib_dev {
 	 * Sleepable RCU that prevents destruction of MRs while they are still
 	 * being used by a page fault handler.
 	 */
-	struct srcu_struct mr_srcu;
+	struct srcu_struct odp_srcu;
+	struct xarray odp_mkeys;
+
 	u32 null_mkey;
 	struct mlx5_ib_flow_db *flow_db;
 	/* protect resources needed as part of reset flow */
@@ -999,6 +1006,8 @@ struct mlx5_ib_dev {
 	struct mlx5_srq_table srq_table;
 	struct mlx5_async_ctx async_ctx;
 	struct mlx5_devx_event_table devx_event_table;
+
+	struct xarray sig_mrs;
 };
 
 static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq)
@@ -1162,6 +1171,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
 					     struct ib_udata *udata,
 					     int access_flags);
 void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *mr);
+void mlx5_ib_fence_odp_mr(struct mlx5_ib_mr *mr);
 int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
 			  u64 length, u64 virt_addr, int access_flags,
 			  struct ib_pd *pd, struct ib_udata *udata);
@@ -1223,6 +1233,8 @@ int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev);
 
 struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int entry);
 void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr);
+int mlx5_mr_cache_invalidate(struct mlx5_ib_mr *mr);
+
 int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
			     struct ib_mr_status *mr_status);
 struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd,
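The new odp_destroy union overlays an rcu_head, a list_head and a work_struct because they are alternative deferred-destroy mechanisms that are never armed at the same time, so they can share storage. A sketch of how one arm of such a union is typically driven — the function names here are hypothetical, not the actual odp.c paths:

/* Sketch: deferring MR teardown to process context via the
 * odp_destroy.work member of the union. */
static void child_mr_destroy_work(struct work_struct *work)
{
	struct mlx5_ib_mr *mr =
		container_of(work, struct mlx5_ib_mr, odp_destroy.work);
	/* ... tear down the child MR in process context ... */
}

static void defer_destroy(struct mlx5_ib_mr *mr)
{
	/* Only one union member may be in use at a time. */
	INIT_WORK(&mr->odp_destroy.work, child_mr_destroy_work);
	queue_work(system_unbound_wq, &mr->odp_destroy.work);
}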
