
Commit efb3e0e

Merge branch 'introduce-page_pool_alloc-related-api'
Yunsheng Lin says:

====================
introduce page_pool_alloc() related API

In [1] & [2] & [3], there are use cases for veth and virtio_net to use frag
support in page pool to reduce memory usage, and they may request different
frag sizes depending on the head/tail room space for xdp_frame/shinfo and
mtu/packet size. When the requested frag size is large enough that a single
page can not be split into more than one frag, using frag support only adds a
performance penalty because of the extra frag count handling.

So this patchset provides a page pool API for the driver to allocate memory
with the least memory utilization and performance penalty when it doesn't
know the size of memory it needs beforehand.

1. https://patchwork.kernel.org/project/netdevbpf/patch/d3ae6bd3537fbce379382ac6a42f67e22f27ece2.1683896626.git.lorenzo@kernel.org/
2. https://patchwork.kernel.org/project/netdevbpf/patch/[email protected]/
3. https://github.com/alobakin/linux/tree/iavf-pp-frag
====================

Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Jakub Kicinski <[email protected]>
2 parents f4dbc2b + 2d0de67 commit efb3e0e
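
As a rough illustration of what the new API buys a driver (not part of this merge; the queue structure and function names below are hypothetical, while the page_pool_* and skb calls are the ones this series adds or the kernel already provides), an RX path that does not know its buffer size up front can ask for a virtual address and let the pool size the fragment:

#include <linux/skbuff.h>
#include <net/page_pool/helpers.h>

struct demo_rq {                                /* hypothetical receive queue */
        struct page_pool *page_pool;
};

static struct sk_buff *demo_build_rx_skb(struct demo_rq *rq,
                                          unsigned int headroom,
                                          unsigned int data_len)
{
        /* In/out: the pool may hand back more than requested, e.g. when the
         * leftover space in the current page is too small to be reused.
         */
        unsigned int truesize = SKB_HEAD_ALIGN(data_len) + headroom;
        struct sk_buff *skb;
        void *va;

        va = page_pool_dev_alloc_va(rq->page_pool, &truesize);
        if (!va)
                return NULL;

        skb = napi_build_skb(va, truesize);
        if (!skb) {
                /* Return the fragment to the pool by virtual address. */
                page_pool_free_va(rq->page_pool, va, true);
                return NULL;
        }

        skb_reserve(skb, headroom);
        skb_mark_for_recycle(skb);      /* let the skb free path recycle into the pool */
        return skb;
}

This mirrors the veth conversion further down, where the fragment size depends on headroom and packet length known only at run time.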

12 files changed, 220 insertions(+), 58 deletions(-)

Documentation/networking/page_pool.rst

Lines changed: 3 additions & 1 deletion
@@ -58,7 +58,9 @@ a page will cause no race conditions is enough.
 
 .. kernel-doc:: include/net/page_pool/helpers.h
    :identifiers: page_pool_put_page page_pool_put_full_page
-                 page_pool_recycle_direct page_pool_dev_alloc_pages
+                 page_pool_recycle_direct page_pool_free_va
+                 page_pool_dev_alloc_pages page_pool_dev_alloc_frag
+                 page_pool_dev_alloc page_pool_dev_alloc_va
                  page_pool_get_dma_addr page_pool_get_dma_dir
 
 .. kernel-doc:: net/core/page_pool.c

drivers/net/ethernet/broadcom/bnxt/bnxt.c

Lines changed: 0 additions & 2 deletions
@@ -3302,8 +3302,6 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp,
         pp.dma_dir = bp->rx_dir;
         pp.max_len = PAGE_SIZE;
         pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV;
-        if (PAGE_SIZE > BNXT_RX_PAGE_SIZE)
-                pp.flags |= PP_FLAG_PAGE_FRAG;
 
         rxr->page_pool = page_pool_create(&pp);
         if (IS_ERR(rxr->page_pool)) {

drivers/net/ethernet/hisilicon/hns3/hns3_enet.c

Lines changed: 1 addition & 2 deletions
@@ -4940,8 +4940,7 @@ static void hns3_put_ring_config(struct hns3_nic_priv *priv)
 static void hns3_alloc_page_pool(struct hns3_enet_ring *ring)
 {
         struct page_pool_params pp_params = {
-                .flags = PP_FLAG_DMA_MAP | PP_FLAG_PAGE_FRAG |
-                         PP_FLAG_DMA_SYNC_DEV,
+                .flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
                 .order = hns3_page_order(ring),
                 .pool_size = ring->desc_num * hns3_buf_size(ring) /
                              (PAGE_SIZE << hns3_page_order(ring)),

drivers/net/ethernet/intel/idpf/idpf_txrx.c

Lines changed: 0 additions & 3 deletions
@@ -595,9 +595,6 @@ static struct page_pool *idpf_rx_create_page_pool(struct idpf_queue *rxbufq)
                 .offset = 0,
         };
 
-        if (rxbufq->rx_buf_size == IDPF_RX_BUF_2048)
-                pp.flags |= PP_FLAG_PAGE_FRAG;
-
         return page_pool_create(&pp);
 }
 

drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c

Lines changed: 1 addition & 1 deletion
@@ -1404,7 +1404,7 @@ int otx2_pool_init(struct otx2_nic *pfvf, u16 pool_id,
         }
 
         pp_params.order = get_order(buf_size);
-        pp_params.flags = PP_FLAG_PAGE_FRAG | PP_FLAG_DMA_MAP;
+        pp_params.flags = PP_FLAG_DMA_MAP;
         pp_params.pool_size = min(OTX2_PAGE_POOL_SZ, numptrs);
         pp_params.nid = NUMA_NO_NODE;
         pp_params.dev = pfvf->dev;

drivers/net/ethernet/mellanox/mlx5/core/en_main.c

Lines changed: 1 addition & 1 deletion
@@ -897,7 +897,7 @@ static int mlx5e_alloc_rq(struct mlx5e_params *params,
                 struct page_pool_params pp_params = { 0 };
 
                 pp_params.order = 0;
-                pp_params.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV | PP_FLAG_PAGE_FRAG;
+                pp_params.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV;
                 pp_params.pool_size = pool_size;
                 pp_params.nid = node;
                 pp_params.dev = rq->pdev;

drivers/net/veth.c

Lines changed: 16 additions & 9 deletions
@@ -737,10 +737,11 @@ static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq,
         if (skb_shared(skb) || skb_head_is_locked(skb) ||
             skb_shinfo(skb)->nr_frags ||
             skb_headroom(skb) < XDP_PACKET_HEADROOM) {
-                u32 size, len, max_head_size, off;
+                u32 size, len, max_head_size, off, truesize, page_offset;
                 struct sk_buff *nskb;
                 struct page *page;
                 int i, head_off;
+                void *va;
 
                 /* We need a private copy of the skb and data buffers since
                  * the ebpf program can modify it. We segment the original skb
@@ -753,22 +754,24 @@ static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq,
                 if (skb->len > PAGE_SIZE * MAX_SKB_FRAGS + max_head_size)
                         goto drop;
 
+                size = min_t(u32, skb->len, max_head_size);
+                truesize = SKB_HEAD_ALIGN(size) + VETH_XDP_HEADROOM;
+
                 /* Allocate skb head */
-                page = page_pool_dev_alloc_pages(rq->page_pool);
-                if (!page)
+                va = page_pool_dev_alloc_va(rq->page_pool, &truesize);
+                if (!va)
                         goto drop;
 
-                nskb = napi_build_skb(page_address(page), PAGE_SIZE);
+                nskb = napi_build_skb(va, truesize);
                 if (!nskb) {
-                        page_pool_put_full_page(rq->page_pool, page, true);
+                        page_pool_free_va(rq->page_pool, va, true);
                         goto drop;
                 }
 
                 skb_reserve(nskb, VETH_XDP_HEADROOM);
                 skb_copy_header(nskb, skb);
                 skb_mark_for_recycle(nskb);
 
-                size = min_t(u32, skb->len, max_head_size);
                 if (skb_copy_bits(skb, 0, nskb->data, size)) {
                         consume_skb(nskb);
                         goto drop;
@@ -783,14 +786,18 @@ static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq,
                 len = skb->len - off;
 
                 for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) {
-                        page = page_pool_dev_alloc_pages(rq->page_pool);
+                        size = min_t(u32, len, PAGE_SIZE);
+                        truesize = size;
+
+                        page = page_pool_dev_alloc(rq->page_pool, &page_offset,
+                                                   &truesize);
                         if (!page) {
                                 consume_skb(nskb);
                                 goto drop;
                         }
 
-                        size = min_t(u32, len, PAGE_SIZE);
-                        skb_add_rx_frag(nskb, i, page, 0, size, PAGE_SIZE);
+                        skb_add_rx_frag(nskb, i, page, page_offset, size,
+                                        truesize);
                         if (skb_copy_bits(skb, off, page_address(page),
                                           size)) {
                                 consume_skb(nskb);

drivers/net/wireless/mediatek/mt76/mac80211.c

Lines changed: 1 addition & 1 deletion
@@ -570,7 +570,7 @@ int mt76_create_page_pool(struct mt76_dev *dev, struct mt76_queue *q)
 {
         struct page_pool_params pp_params = {
                 .order = 0,
-                .flags = PP_FLAG_PAGE_FRAG,
+                .flags = 0,
                 .nid = NUMA_NO_NODE,
                 .dev = dev->dma_dev,
         };

include/net/page_pool/helpers.h

Lines changed: 182 additions & 28 deletions
@@ -8,23 +8,46 @@
 /**
  * DOC: page_pool allocator
  *
- * The page_pool allocator is optimized for the XDP mode that
- * uses one frame per-page, but it can fallback on the
- * regular page allocator APIs.
- *
- * Basic use involves replacing alloc_pages() calls with the
- * page_pool_alloc_pages() call. Drivers should use
- * page_pool_dev_alloc_pages() replacing dev_alloc_pages().
- *
- * The API keeps track of in-flight pages, in order to let API users know
- * when it is safe to free a page_pool object. Thus, API users
- * must call page_pool_put_page() to free the page, or attach
- * the page to a page_pool-aware object like skbs marked with
+ * The page_pool allocator is optimized for recycling page or page fragment used
+ * by skb packet and xdp frame.
+ *
+ * Basic use involves replacing and alloc_pages() calls with page_pool_alloc(),
+ * which allocate memory with or without page splitting depending on the
+ * requested memory size.
+ *
+ * If the driver knows that it always requires full pages or its allocations are
+ * always smaller than half a page, it can use one of the more specific API
+ * calls:
+ *
+ * 1. page_pool_alloc_pages(): allocate memory without page splitting when
+ * driver knows that the memory it need is always bigger than half of the page
+ * allocated from page pool. There is no cache line dirtying for 'struct page'
+ * when a page is recycled back to the page pool.
+ *
+ * 2. page_pool_alloc_frag(): allocate memory with page splitting when driver
+ * knows that the memory it need is always smaller than or equal to half of the
+ * page allocated from page pool. Page splitting enables memory saving and thus
+ * avoids TLB/cache miss for data access, but there also is some cost to
+ * implement page splitting, mainly some cache line dirtying/bouncing for
+ * 'struct page' and atomic operation for page->pp_frag_count.
+ *
+ * The API keeps track of in-flight pages, in order to let API users know when
+ * it is safe to free a page_pool object, the API users must call
+ * page_pool_put_page() or page_pool_free_va() to free the page_pool object, or
+ * attach the page_pool object to a page_pool-aware object like skbs marked with
  * skb_mark_for_recycle().
  *
- * API users must call page_pool_put_page() once on a page, as it
- * will either recycle the page, or in case of refcnt > 1, it will
- * release the DMA mapping and in-flight state accounting.
+ * page_pool_put_page() may be called multi times on the same page if a page is
+ * split into multi fragments. For the last fragment, it will either recycle the
+ * page, or in case of page->_refcount > 1, it will release the DMA mapping and
+ * in-flight state accounting.
+ *
+ * dma_sync_single_range_for_device() is only called for the last fragment when
+ * page_pool is created with PP_FLAG_DMA_SYNC_DEV flag, so it depends on the
+ * last freed fragment to do the sync_for_device operation for all fragments in
+ * the same page when a page is split, the API user must setup pool->p.max_len
+ * and pool->p.offset correctly and ensure that page_pool_put_page() is called
+ * with dma_sync_size being -1 for fragment API.
  */
 #ifndef _NET_PAGE_POOL_HELPERS_H
 #define _NET_PAGE_POOL_HELPERS_H
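
The DOC text above assumes the pool owns DMA mapping and device sync. A minimal setup sketch of what it describes (the helper name and values are illustrative, not taken from any driver in this merge; the fields match the driver hunks above):

#include <linux/dma-mapping.h>
#include <linux/numa.h>
#include <net/page_pool/helpers.h>

static struct page_pool *demo_create_rx_pool(struct device *dev,
                                              unsigned int pool_size)
{
        struct page_pool_params pp_params = {
                /* The driver hunks above drop PP_FLAG_PAGE_FRAG: whether a
                 * page is split now depends only on which alloc call is used.
                 */
                .flags          = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
                .order          = 0,
                .pool_size      = pool_size,
                .nid            = NUMA_NO_NODE,
                .dev            = dev,
                .dma_dir        = DMA_FROM_DEVICE,
                /* With DMA_SYNC_DEV plus page splitting, max_len/offset must
                 * cover everything the device may write, and fragments must
                 * be freed with dma_sync_size of -1 (which is what
                 * page_pool_put_full_page() and page_pool_free_va() pass).
                 */
                .max_len        = PAGE_SIZE,
                .offset         = 0,
        };

        return page_pool_create(&pp_params);
}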
@@ -73,6 +96,17 @@ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool)
         return page_pool_alloc_pages(pool, gfp);
 }
 
+/**
+ * page_pool_dev_alloc_frag() - allocate a page fragment.
+ * @pool: pool from which to allocate
+ * @offset: offset to the allocated page
+ * @size: requested size
+ *
+ * Get a page fragment from the page allocator or page_pool caches.
+ *
+ * Return:
+ * Return allocated page fragment, otherwise return NULL.
+ */
 static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool,
                                                     unsigned int *offset,
                                                     unsigned int size)
@@ -82,6 +116,91 @@ static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool,
         return page_pool_alloc_frag(pool, offset, size, gfp);
 }
 
+static inline struct page *page_pool_alloc(struct page_pool *pool,
+                                           unsigned int *offset,
+                                           unsigned int *size, gfp_t gfp)
+{
+        unsigned int max_size = PAGE_SIZE << pool->p.order;
+        struct page *page;
+
+        if ((*size << 1) > max_size) {
+                *size = max_size;
+                *offset = 0;
+                return page_pool_alloc_pages(pool, gfp);
+        }
+
+        page = page_pool_alloc_frag(pool, offset, *size, gfp);
+        if (unlikely(!page))
+                return NULL;
+
+        /* There is very likely not enough space for another fragment, so append
+         * the remaining size to the current fragment to avoid truesize
+         * underestimate problem.
+         */
+        if (pool->frag_offset + *size > max_size) {
+                *size = max_size - *offset;
+                pool->frag_offset = max_size;
+        }
+
+        return page;
+}
+
+/**
+ * page_pool_dev_alloc() - allocate a page or a page fragment.
+ * @pool: pool from which to allocate
+ * @offset: offset to the allocated page
+ * @size: in as the requested size, out as the allocated size
+ *
+ * Get a page or a page fragment from the page allocator or page_pool caches
+ * depending on the requested size in order to allocate memory with least memory
+ * utilization and performance penalty.
+ *
+ * Return:
+ * Return allocated page or page fragment, otherwise return NULL.
+ */
+static inline struct page *page_pool_dev_alloc(struct page_pool *pool,
+                                               unsigned int *offset,
+                                               unsigned int *size)
+{
+        gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN);
+
+        return page_pool_alloc(pool, offset, size, gfp);
+}
+
+static inline void *page_pool_alloc_va(struct page_pool *pool,
+                                       unsigned int *size, gfp_t gfp)
+{
+        unsigned int offset;
+        struct page *page;
+
+        /* Mask off __GFP_HIGHMEM to ensure we can use page_address() */
+        page = page_pool_alloc(pool, &offset, size, gfp & ~__GFP_HIGHMEM);
+        if (unlikely(!page))
+                return NULL;
+
+        return page_address(page) + offset;
+}
+
+/**
+ * page_pool_dev_alloc_va() - allocate a page or a page fragment and return its
+ *                            va.
+ * @pool: pool from which to allocate
+ * @size: in as the requested size, out as the allocated size
+ *
+ * This is just a thin wrapper around the page_pool_alloc() API, and
+ * it returns va of the allocated page or page fragment.
+ *
+ * Return:
+ * Return the va for the allocated page or page fragment, otherwise return NULL.
+ */
+static inline void *page_pool_dev_alloc_va(struct page_pool *pool,
+                                           unsigned int *size)
+{
+        gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN);
+
+        return page_pool_alloc_va(pool, size, gfp);
+}
+
 /**
  * page_pool_get_dma_dir() - Retrieve the stored DMA direction.
  * @pool: pool from which page was allocated
@@ -115,28 +234,49 @@ static inline long page_pool_defrag_page(struct page *page, long nr)
         long ret;
 
         /* If nr == pp_frag_count then we have cleared all remaining
-         * references to the page. No need to actually overwrite it, instead
-         * we can leave this to be overwritten by the calling function.
+         * references to the page:
+         * 1. 'n == 1': no need to actually overwrite it.
+         * 2. 'n != 1': overwrite it with one, which is the rare case
+         *              for pp_frag_count draining.
          *
-         * The main advantage to doing this is that an atomic_read is
-         * generally a much cheaper operation than an atomic update,
-         * especially when dealing with a page that may be partitioned
-         * into only 2 or 3 pieces.
+         * The main advantage to doing this is that not only we avoid a atomic
+         * update, as an atomic_read is generally a much cheaper operation than
+         * an atomic update, especially when dealing with a page that may be
+         * partitioned into only 2 or 3 pieces; but also unify the pp_frag_count
+         * handling by ensuring all pages have partitioned into only 1 piece
+         * initially, and only overwrite it when the page is partitioned into
+         * more than one piece.
          */
-        if (atomic_long_read(&page->pp_frag_count) == nr)
+        if (atomic_long_read(&page->pp_frag_count) == nr) {
+                /* As we have ensured nr is always one for constant case using
+                 * the BUILD_BUG_ON(), only need to handle the non-constant case
+                 * here for pp_frag_count draining, which is a rare case.
+                 */
+                BUILD_BUG_ON(__builtin_constant_p(nr) && nr != 1);
+                if (!__builtin_constant_p(nr))
+                        atomic_long_set(&page->pp_frag_count, 1);
+
                 return 0;
+        }
 
         ret = atomic_long_sub_return(nr, &page->pp_frag_count);
         WARN_ON(ret < 0);
+
+        /* We are the last user here too, reset pp_frag_count back to 1 to
+         * ensure all pages have been partitioned into 1 piece initially,
+         * this should be the rare case when the last two fragment users call
+         * page_pool_defrag_page() currently.
+         */
+        if (unlikely(!ret))
+                atomic_long_set(&page->pp_frag_count, 1);
+
         return ret;
 }
 
-static inline bool page_pool_is_last_frag(struct page_pool *pool,
-                                          struct page *page)
+static inline bool page_pool_is_last_frag(struct page *page)
 {
-        /* If fragments aren't enabled or count is 0 we were the last user */
-        return !(pool->p.flags & PP_FLAG_PAGE_FRAG) ||
-               (page_pool_defrag_page(page, 1) == 0);
+        /* If page_pool_defrag_page() returns 0, we were the last user */
+        return page_pool_defrag_page(page, 1) == 0;
 }
 
 /**
@@ -161,7 +301,7 @@ static inline void page_pool_put_page(struct page_pool *pool,
          * allow registering MEM_TYPE_PAGE_POOL, but shield linker.
          */
 #ifdef CONFIG_PAGE_POOL
-        if (!page_pool_is_last_frag(pool, page))
+        if (!page_pool_is_last_frag(page))
                 return;
 
         page_pool_put_defragged_page(pool, page, dma_sync_size, allow_direct);
@@ -200,6 +340,20 @@ static inline void page_pool_recycle_direct(struct page_pool *pool,
 #define PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA \
                 (sizeof(dma_addr_t) > sizeof(unsigned long))
 
+/**
+ * page_pool_free_va() - free a va into the page_pool
+ * @pool: pool from which va was allocated
+ * @va: va to be freed
+ * @allow_direct: freed by the consumer, allow lockless caching
+ *
+ * Free a va allocated from page_pool_allo_va().
+ */
+static inline void page_pool_free_va(struct page_pool *pool, void *va,
+                                     bool allow_direct)
+{
+        page_pool_put_page(pool, virt_to_head_page(va), -1, allow_direct);
+}
+
 /**
  * page_pool_get_dma_addr() - Retrieve the stored DMA address.
  * @page: page allocated from a page pool
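
Putting the helpers together: a rough sketch (hypothetical function name and sizes) of an RX buffer allocation that does not know its size in advance. Drivers that always need more than half a page would call page_pool_dev_alloc_pages() instead, and drivers whose buffers always fit in half a page would call page_pool_dev_alloc_frag(), as the DOC comment above explains.

#include <net/page_pool/helpers.h>

static struct page *demo_alloc_rx_buf(struct page_pool *pool,
                                       unsigned int buf_len,
                                       unsigned int *offset,
                                       unsigned int *truesize)
{
        struct page *page;

        /* In: requested size. Out: size actually reserved, possibly rounded
         * up when the remaining space in the page is too small to serve
         * another fragment (avoids truesize underestimation).
         */
        *truesize = buf_len;
        page = page_pool_dev_alloc(pool, offset, truesize);
        if (!page)
                return NULL;

        /* The buffer occupies [*offset, *offset + *truesize) of the page; a
         * caller would typically attach it with skb_add_rx_frag(skb, i, page,
         * *offset, buf_len, *truesize), as the veth hunk above does.
         */
        return page;
}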
