
Commit dbae2b0

Paolo Abeni authored and kuba-moo committed
net: skb: introduce and use a single page frag cache
After commit 3226b15 ("net: avoid 32 x truesize under-estimation for
tiny skbs") we are observing 10-20% regressions in performance tests
with small packets. The perf trace points to high pressure on the slab
allocator.

This change tries to improve the allocation scheme for small packets,
using an idea originally suggested by Eric: a new per-CPU page frag is
introduced and used in __napi_alloc_skb to cope with small allocation
requests.

To ensure that the above does not lead to excessive truesize
underestimation, the frag size for small allocations is inflated to 1K
and all the above is restricted to builds with 4K page size.

Note that we need to update accordingly the run-time check introduced
with commit fd9ea57 ("net: add napi_get_frags_check() helper").

Alex suggested a smart page refcount scheme to reduce the number of
atomic operations and deal properly with pfmemalloc pages.

Under small packet UDP flood, I measure a 15% peak tput increase.

Suggested-by: Eric Dumazet <[email protected]>
Suggested-by: Alexander H Duyck <[email protected]>
Signed-off-by: Paolo Abeni <[email protected]>
Reviewed-by: Eric Dumazet <[email protected]>
Reviewed-by: Alexander Duyck <[email protected]>
Link: https://lore.kernel.org/r/6b6f65957c59f86a353fc09a5127e83a32ab5999.1664350652.git.pabeni@redhat.com
Signed-off-by: Jakub Kicinski <[email protected]>
1 parent 7cba183 commit dbae2b0
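
The refcount scheme Alex suggested is easiest to see with concrete numbers. Below is a minimal userspace sketch, not kernel code: PAGE_SZ, FRAG_SZ and the simulated refs merely model the struct page refcount that the patch manipulates. A 4K page is sliced top-down into four 1K fragments, and the page is charged one reference per fragment up front, so the hot path needs no further atomic operations.

	#include <stdio.h>

	#define PAGE_SZ	4096
	#define FRAG_SZ	1024

	int main(void)
	{
		int offset = PAGE_SZ - FRAG_SZ;	 /* first frag sits at the top */
		int refs = 1 + offset / FRAG_SZ; /* 1 from alloc + page_ref_add(3) */
		int frags = 0;

		/* hand out fragments until the page is exhausted, as
		 * page_frag_alloc_1k() below does with nc->offset
		 */
		for (; offset >= 0; offset -= FRAG_SZ, frags++)
			printf("frag %d at offset %d\n", frags, offset);

		printf("%d frags, %d refs: one reference per fragment, so the\n"
		       "page is freed only after the last fragment is released\n",
		       frags, refs);
		return 0;
	}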

3 files changed: +104, −22 lines


include/linux/netdevice.h

Lines changed: 1 addition & 0 deletions
@@ -3821,6 +3821,7 @@ void netif_receive_skb_list(struct list_head *head);
 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb);
 void napi_gro_flush(struct napi_struct *napi, bool flush_old);
 struct sk_buff *napi_get_frags(struct napi_struct *napi);
+void napi_get_frags_check(struct napi_struct *napi);
 gro_result_t napi_gro_frags(struct napi_struct *napi);
 struct packet_offload *gro_find_receive_by_type(__be16 type);
 struct packet_offload *gro_find_complete_by_type(__be16 type);

net/core/dev.c

Lines changed: 0 additions & 17 deletions
@@ -6358,23 +6358,6 @@ int dev_set_threaded(struct net_device *dev, bool threaded)
 }
 EXPORT_SYMBOL(dev_set_threaded);
 
-/* Double check that napi_get_frags() allocates skbs with
- * skb->head being backed by slab, not a page fragment.
- * This is to make sure bug fixed in 3226b158e67c
- * ("net: avoid 32 x truesize under-estimation for tiny skbs")
- * does not accidentally come back.
- */
-static void napi_get_frags_check(struct napi_struct *napi)
-{
-	struct sk_buff *skb;
-
-	local_bh_disable();
-	skb = napi_get_frags(napi);
-	WARN_ON_ONCE(skb && skb->head_frag);
-	napi_free_frags(napi);
-	local_bh_enable();
-}
-
 void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
 			   int (*poll)(struct napi_struct *, int), int weight)
 {

net/core/skbuff.c

Lines changed: 103 additions & 5 deletions
@@ -134,15 +134,90 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
 #define NAPI_SKB_CACHE_BULK	16
 #define NAPI_SKB_CACHE_HALF	(NAPI_SKB_CACHE_SIZE / 2)
 
+#if PAGE_SIZE == SZ_4K
+
+#define NAPI_HAS_SMALL_PAGE_FRAG	1
+#define NAPI_SMALL_PAGE_PFMEMALLOC(nc)	((nc).pfmemalloc)
+
+/* specialized page frag allocator using a single order 0 page
+ * and slicing it into 1K sized fragment. Constrained to systems
+ * with a very limited amount of 1K fragments fitting a single
+ * page - to avoid excessive truesize underestimation
+ */
+
+struct page_frag_1k {
+	void *va;
+	u16 offset;
+	bool pfmemalloc;
+};
+
+static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp)
+{
+	struct page *page;
+	int offset;
+
+	offset = nc->offset - SZ_1K;
+	if (likely(offset >= 0))
+		goto use_frag;
+
+	page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
+	if (!page)
+		return NULL;
+
+	nc->va = page_address(page);
+	nc->pfmemalloc = page_is_pfmemalloc(page);
+	offset = PAGE_SIZE - SZ_1K;
+	page_ref_add(page, offset / SZ_1K);
+
+use_frag:
+	nc->offset = offset;
+	return nc->va + offset;
+}
+#else
+
+/* the small page is actually unused in this build; add dummy helpers
+ * to please the compiler and avoid later preprocessor's conditionals
+ */
+#define NAPI_HAS_SMALL_PAGE_FRAG	0
+#define NAPI_SMALL_PAGE_PFMEMALLOC(nc)	false
+
+struct page_frag_1k {
+};
+
+static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask)
+{
+	return NULL;
+}
+
+#endif
+
 struct napi_alloc_cache {
 	struct page_frag_cache page;
+	struct page_frag_1k page_small;
 	unsigned int skb_count;
 	void *skb_cache[NAPI_SKB_CACHE_SIZE];
 };
 
 static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
 static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
 
+/* Double check that napi_get_frags() allocates skbs with
+ * skb->head being backed by slab, not a page fragment.
+ * This is to make sure bug fixed in 3226b158e67c
+ * ("net: avoid 32 x truesize under-estimation for tiny skbs")
+ * does not accidentally come back.
+ */
+void napi_get_frags_check(struct napi_struct *napi)
+{
+	struct sk_buff *skb;
+
+	local_bh_disable();
+	skb = napi_get_frags(napi);
+	WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag);
+	napi_free_frags(napi);
+	local_bh_enable();
+}
+
 void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
 {
 	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
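
For context, the matching release path is not part of this patch: skb heads with skb->head_frag set are freed through the existing skb_free_frag() helper, which roughly does

	skb_free_frag(skb->head)
	  -> page_frag_free(addr)
	       -> put_page on virt_to_head_page(addr)

dropping exactly one page reference per fragment. Since page_frag_alloc_1k() above pre-charges one reference per 1K slice, the order-0 page returns to the buddy allocator only when the last of its fragments is released, and pfmemalloc pages need no special casing here: the flag is simply captured at allocation time and mirrored into skb->pfmemalloc.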
@@ -561,15 +636,18 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
 {
 	struct napi_alloc_cache *nc;
 	struct sk_buff *skb;
+	bool pfmemalloc;
 	void *data;
 
 	DEBUG_NET_WARN_ON_ONCE(!in_softirq());
 	len += NET_SKB_PAD + NET_IP_ALIGN;
 
 	/* If requested length is either too small or too big,
 	 * we use kmalloc() for skb->head allocation.
+	 * When the small frag allocator is available, prefer it over kmalloc
+	 * for small fragments
 	 */
-	if (len <= SKB_WITH_OVERHEAD(1024) ||
+	if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) ||
 	    len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
 	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
 		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
@@ -580,13 +658,33 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
 	}
 
 	nc = this_cpu_ptr(&napi_alloc_cache);
-	len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-	len = SKB_DATA_ALIGN(len);
 
 	if (sk_memalloc_socks())
 		gfp_mask |= __GFP_MEMALLOC;
 
-	data = page_frag_alloc(&nc->page, len, gfp_mask);
+	if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {
+		/* we are artificially inflating the allocation size, but
+		 * that is not as bad as it may look like, as:
+		 * - 'len' less than GRO_MAX_HEAD makes little sense
+		 * - On most systems, larger 'len' values lead to fragment
+		 *   size above 512 bytes
+		 * - kmalloc would use the kmalloc-1k slab for such values
+		 * - Builds with smaller GRO_MAX_HEAD will very likely do
+		 *   little networking, as that implies no WiFi and no
+		 *   tunnels support, and 32 bits arches.
+		 */
+		len = SZ_1K;
+
+		data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
+		pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small);
+	} else {
+		len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+		len = SKB_DATA_ALIGN(len);
+
+		data = page_frag_alloc(&nc->page, len, gfp_mask);
+		pfmemalloc = nc->page.pfmemalloc;
+	}
+
 	if (unlikely(!data))
 		return NULL;
 
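
For a rough feel of where the cut-over sits: SKB_WITH_OVERHEAD(X) is X minus SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), so assuming a common x86_64 config where the shared info occupies 320 bytes, SKB_WITH_OVERHEAD(1024) = 1024 - 320 = 704. Head requests of up to 704 bytes (after the NET_SKB_PAD + NET_IP_ALIGN bump above) take the new 1K frag path, while larger ones, up to SKB_WITH_OVERHEAD(PAGE_SIZE), keep using the pre-existing per-CPU page_frag_cache.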
@@ -596,7 +694,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
 		return NULL;
 	}
 
-	if (nc->page.pfmemalloc)
+	if (pfmemalloc)
 		skb->pfmemalloc = 1;
 	skb->head_frag = 1;
 
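
To see the new path from a consumer's perspective, here is a hypothetical driver RX snippet (example_rx and hw_buf are made up for illustration; napi_alloc_skb() is the existing GFP_ATOMIC wrapper around __napi_alloc_skb()). On 4K-page builds, the skb head below now comes from the per-CPU 1K frag cache rather than the kmalloc-1k slab:

	/* hypothetical RX completion handler for a small (e.g. 128 byte) frame */
	static void example_rx(struct napi_struct *napi, void *hw_buf,
			       unsigned int pkt_len)
	{
		struct sk_buff *skb;

		skb = napi_alloc_skb(napi, pkt_len);	/* head backed by a 1K frag */
		if (!skb)
			return;				/* drop on allocation failure */

		skb_put_data(skb, hw_buf, pkt_len);	/* copy the small frame in */
		skb->protocol = eth_type_trans(skb, napi->dev);
		napi_gro_receive(napi, skb);
	}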
