
Commit 77c05d2

Merge branch 'page_pool-API-for-numa-node-change-handling'
Saeed Mahameed says:

====================
page_pool: API for numa node change handling

This series extends the page pool API to allow page pool consumers to update
the page pool NUMA node on the fly. This is required since on some systems
rx ring irqs can migrate between NUMA nodes, due to the irq balancer or
user-defined scripts; the current page pool has no way to know of such a
migration and will keep allocating and holding on to pages from the wrong
NUMA node, which is bad for consumer performance.

1) Add an API to update the NUMA node id of the page pool.
   Consumers will call this API to update the page pool NUMA node id.

2) Don't recycle non-reusable pages. Page pool will check upon page return
   whether a page is suitable for recycling or not:
   2.1) when it belongs to a different NUMA node.
   2.2) when it was allocated under memory pressure.

3) mlx5 will use the new API to update the page pool NUMA id on demand.

The series is joint work between me and Jonathan; we tested it and it proved
itself worthy to avoid page allocator bottlenecks and to improve packet rate
and cpu utilization significantly for the scenarios described above.

Performance testing:
XDP drop/tx rate and TCP single/multi stream, on the mlx5 driver, while
migrating the rx ring irq from a close to a far NUMA node. The mlx5 internal
page cache was locally disabled to get pure page pool results.

CPU: Intel(R) Xeon(R) CPU E5-2603 v4 @ 1.70GHz
NIC: Mellanox Technologies MT27700 Family [ConnectX-4] (100G)

XDP Drop/TX single core:
NUMA  | XDP  | Before   | After
---------------------------------------
Close | Drop | 11 Mpps  | 10.9 Mpps
Far   | Drop | 4.4 Mpps | 5.8 Mpps
Close | TX   | 6.5 Mpps | 6.5 Mpps
Far   | TX   | 3.5 Mpps | 4 Mpps

Improvement is about 30% in drop packet rate and 15% in tx packet rate for
the far NUMA test; no degradation for the close NUMA tests.

TCP single/multi cpu/stream:
NUMA  | #cpu | Before  | After
--------------------------------------
Close | 1    | 18 Gbps | 18 Gbps
Far   | 1    | 15 Gbps | 18 Gbps
Close | 12   | 80 Gbps | 80 Gbps
Far   | 12   | 68 Gbps | 80 Gbps

In all test cases we see improvement for the far NUMA case, and no impact on
the close NUMA case.

Performance analysis and conclusions by Jesper [1]:
The impact on XDP drop on x86_64 is inconclusive, showing only a 0.3459 ns
slowdown, which is below the measurement accuracy of the system.

v2->v3:
 - Rebase on top of latest net-next and Jesper's page pool object release
   patchset [2]
 - No code changes
 - Performance analysis by Jesper added to the cover letter

v1->v2:
 - Drop the last patch, as requested by Ilias and Jesper
 - Fix the documentation's performance numbers order

[1] https://github.com/xdp-project/xdp-project/blob/master/areas/mem/page_pool04_inflight_changes.org#performance-notes
[2] https://patchwork.ozlabs.org/cover/1192098/
====================

Acked-by: Jesper Dangaard Brouer <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
2 parents: 1f12177 + 6849c6d

4 files changed: 53 additions, 1 deletion

drivers/net/ethernet/mellanox/mlx5/core/en_rx.c (3 additions, 0 deletions)

@@ -1386,6 +1386,9 @@ int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget)
 	if (unlikely(!test_bit(MLX5E_RQ_STATE_ENABLED, &rq->state)))
 		return 0;
 
+	if (rq->page_pool)
+		page_pool_nid_changed(rq->page_pool, numa_mem_id());
+
 	if (rq->cqd.left) {
 		work_done += mlx5e_decompress_cqes_cont(rq, cqwq, 0, budget);
 		if (rq->cqd.left || work_done >= budget)

include/net/page_pool.h (7 additions, 0 deletions)

@@ -204,4 +204,11 @@ static inline bool page_pool_put(struct page_pool *pool)
 	return refcount_dec_and_test(&pool->user_cnt);
 }
 
+/* Caller must provide appropriate safe context, e.g. NAPI. */
+void page_pool_update_nid(struct page_pool *pool, int new_nid);
+static inline void page_pool_nid_changed(struct page_pool *pool, int new_nid)
+{
+	if (unlikely(pool->p.nid != new_nid))
+		page_pool_update_nid(pool, new_nid);
+}
 #endif /* _NET_PAGE_POOL_H */
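
The header pairs a cheap inline check (page_pool_nid_changed) with an out-of-line update (page_pool_update_nid), so the common case where the node has not changed costs only a single compare. As a rough sketch of how a consumer is expected to use it, the pattern below mirrors the mlx5 hunk above: call the helper from the NAPI poll path (a safe context) with the node the poll is currently running on. The driver structure and function names here are illustrative only, not part of this series.

/* Hypothetical driver: refresh the pool's NUMA node at the top of NAPI poll. */
static int my_drv_napi_poll(struct napi_struct *napi, int budget)
{
	struct my_drv_rx_ring *ring = container_of(napi, struct my_drv_rx_ring, napi);

	if (ring->page_pool)
		/* No-op unless the irq (and thus the polling cpu) moved nodes. */
		page_pool_nid_changed(ring->page_pool, numa_mem_id());

	return my_drv_process_rx_cq(ring, budget);	/* illustrative */
}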

include/trace/events/page_pool.h (22 additions, 0 deletions)

@@ -89,6 +89,28 @@ TRACE_EVENT(page_pool_state_hold,
 		  __entry->pool, __entry->page, __entry->pfn, __entry->hold)
 );
 
+TRACE_EVENT(page_pool_update_nid,
+
+	TP_PROTO(const struct page_pool *pool, int new_nid),
+
+	TP_ARGS(pool, new_nid),
+
+	TP_STRUCT__entry(
+		__field(const struct page_pool *, pool)
+		__field(int, pool_nid)
+		__field(int, new_nid)
+	),
+
+	TP_fast_assign(
+		__entry->pool = pool;
+		__entry->pool_nid = pool->p.nid;
+		__entry->new_nid = new_nid;
+	),
+
+	TP_printk("page_pool=%p pool_nid=%d new_nid=%d",
+		  __entry->pool, __entry->pool_nid, __entry->new_nid)
+);
+
 #endif /* _TRACE_PAGE_POOL_H */
 
 /* This part must be outside protection */
net/core/page_pool.c (21 additions, 1 deletion)

@@ -281,6 +281,17 @@ static bool __page_pool_recycle_direct(struct page *page,
 	return true;
 }
 
+/* page is NOT reusable when:
+ * 1) allocated when system is under some pressure. (page_is_pfmemalloc)
+ * 2) belongs to a different NUMA node than pool->p.nid.
+ *
+ * To update pool->p.nid users must call page_pool_update_nid.
+ */
+static bool pool_page_reusable(struct page_pool *pool, struct page *page)
+{
+	return !page_is_pfmemalloc(page) && page_to_nid(page) == pool->p.nid;
+}
+
 void __page_pool_put_page(struct page_pool *pool,
 			  struct page *page, bool allow_direct)
 {
@@ -290,7 +301,8 @@ void __page_pool_put_page(struct page_pool *pool,
 	 *
 	 * refcnt == 1 means page_pool owns page, and can recycle it.
 	 */
-	if (likely(page_ref_count(page) == 1)) {
+	if (likely(page_ref_count(page) == 1 &&
+		   pool_page_reusable(pool, page))) {
 		/* Read barrier done in page_ref_count / READ_ONCE */
 
 		if (allow_direct && in_serving_softirq())
@@ -436,3 +448,11 @@ void page_pool_destroy(struct page_pool *pool)
 	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
 }
 EXPORT_SYMBOL(page_pool_destroy);
+
+/* Caller must provide appropriate safe context, e.g. NAPI. */
+void page_pool_update_nid(struct page_pool *pool, int new_nid)
+{
+	trace_page_pool_update_nid(pool, new_nid);
+	pool->p.nid = new_nid;
+}
+EXPORT_SYMBOL(page_pool_update_nid);
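
The effect of the new check is that a returned page is recycled only when the pool is its sole owner and the page is "reusable": not a pfmemalloc emergency page, and resident on the pool's current NUMA node. Pages from the old node simply fall through to the normal free path and drain out, while new allocations come from the updated node. The stand-alone program below is a minimal model of that decision for illustration; it is plain userspace C, not kernel code, and the struct and function names are invented for the example.

#include <stdbool.h>
#include <stdio.h>

struct model_page { int refcount; int nid; bool pfmemalloc; };
struct model_pool { int nid; };

/* Mirrors pool_page_reusable(): reject pressure pages and wrong-node pages. */
static bool model_page_reusable(const struct model_pool *pool,
				const struct model_page *page)
{
	return !page->pfmemalloc && page->nid == pool->nid;
}

/* Mirrors the recycle condition in __page_pool_put_page(). */
static bool model_try_recycle(const struct model_pool *pool,
			      const struct model_page *page)
{
	return page->refcount == 1 && model_page_reusable(pool, page);
}

int main(void)
{
	struct model_pool pool = { .nid = 0 };
	struct model_page same = { .refcount = 1, .nid = 0, .pfmemalloc = false };
	struct model_page far  = { .refcount = 1, .nid = 1, .pfmemalloc = false };

	printf("same-node page recycled: %d\n", model_try_recycle(&pool, &same)); /* 1 */
	printf("far-node page recycled:  %d\n", model_try_recycle(&pool, &far));  /* 0 */

	/* The irq migrated; the driver calls page_pool_nid_changed(), modelled
	 * here by updating pool.nid. Pages returned on the new node now recycle.
	 */
	pool.nid = 1;
	printf("after nid update:        %d\n", model_try_recycle(&pool, &far));  /* 1 */
	return 0;
}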
