
Commit 79cfe9e

io_uring/register: add IORING_REGISTER_RESIZE_RINGS
Once a ring has been created, the sizes of the CQ and SQ rings are fixed. Usually this isn't a problem on the SQ ring side, as it merely controls the available number of requests that can be submitted in a single system call, and there's rarely a need to change that.

For the CQ ring, it's a different story. For the most efficient use of io_uring, it's important that the CQ ring never overflows. This means that applications must size it for the worst-case scenario, which can be wasteful.

Add IORING_REGISTER_RESIZE_RINGS, which allows an application to resize the existing rings. It takes a struct io_uring_params argument, the same one which is used to set up the ring initially, and resizes the rings according to the sizes given.

Certain properties are always inherited from the original ring setup, like SQE128/CQE32 and other setup options. The implementation only allows flags associated with how the CQ ring is sized and clamped.

Existing unconsumed SQE and CQE entries are copied as part of the process. If either the resized SQ or CQ destination ring cannot hold the entries already present in the source rings, then the operation fails with -EOVERFLOW. Any register op holds ->uring_lock, which prevents new submissions, and the internal mapping holds the completion lock as well across moving CQ ring state.

To prevent races between mmap and ring resizing, add a mutex that's solely used to serialize ring resize and mmap. mmap_sem can't be used here, as a fork'ed process may be doing mmaps on the ring as well.

The ctx->resize_lock is held across mmap operations, and the resize will grab it before swapping out the already mapped new data.

Signed-off-by: Jens Axboe <[email protected]>
1 parent d090bff commit 79cfe9e
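
For illustration only, here is a sketch of how an application might invoke the new opcode from userspace. It is not part of this commit: the helper name is made up, it assumes UAPI headers new enough to define IORING_REGISTER_RESIZE_RINGS, and it issues the raw io_uring_register(2) syscall rather than going through a library wrapper.

#include <linux/io_uring.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* hypothetical helper: grow the CQ ring of an existing io_uring instance */
static int resize_cq_ring(int ring_fd, unsigned sq_entries, unsigned cq_entries)
{
        struct io_uring_params p;

        memset(&p, 0, sizeof(p));
        /* only CQ sizing/clamp flags may be passed; SQE128/CQE32 etc. are inherited */
        p.flags = IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP;
        p.sq_entries = sq_entries;      /* e.g. the ring's current SQ size */
        p.cq_entries = cq_entries;      /* the desired (larger) CQ size */

        /* nr_args must be 1; on success the kernel writes the resulting sizes back */
        return syscall(__NR_io_uring_register, ring_fd,
                       IORING_REGISTER_RESIZE_RINGS, &p, 1);
}

Per the checks in io_register_resize_rings() below, any other setup flag in p.flags is rejected with -EINVAL, and -EOVERFLOW is returned if the unconsumed entries in the current rings do not fit into the requested sizes.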

5 files changed: +236 -0 lines changed


include/linux/io_uring_types.h

Lines changed: 7 additions & 0 deletions
@@ -415,6 +415,13 @@ struct io_ring_ctx {
 	/* protected by ->completion_lock */
 	unsigned			evfd_last_cq_tail;
 
+	/*
+	 * Protection for resize vs mmap races - both the mmap and resize
+	 * side will need to grab this lock, to prevent either side from
+	 * being run concurrently with the other.
+	 */
+	struct mutex			resize_lock;
+
 	/*
 	 * If IORING_SETUP_NO_MMAP is used, then the below holds
 	 * the gup'ed pages for the two rings, and the sqes.

include/uapi/linux/io_uring.h

Lines changed: 5 additions & 0 deletions
@@ -615,6 +615,11 @@ enum io_uring_register_op {
 	/* send MSG_RING without having a ring */
 	IORING_REGISTER_SEND_MSG_RING		= 31,
 
+	/* 32 reserved for zc rx */
+
+	/* resize CQ ring */
+	IORING_REGISTER_RESIZE_RINGS		= 33,
+
 	/* this goes last */
 	IORING_REGISTER_LAST,

io_uring/io_uring.c

Lines changed: 1 addition & 0 deletions
@@ -353,6 +353,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
 	INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd);
 	io_napi_init(ctx);
+	mutex_init(&ctx->resize_lock);
 
 	return ctx;

io_uring/memmap.c

Lines changed: 8 additions & 0 deletions
@@ -251,6 +251,8 @@ __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
 	unsigned int npages;
 	void *ptr;
 
+	guard(mutex)(&ctx->resize_lock);
+
 	ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
 	if (IS_ERR(ptr))
 		return PTR_ERR(ptr);
@@ -274,6 +276,7 @@ unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr,
 					 unsigned long len, unsigned long pgoff,
 					 unsigned long flags)
 {
+	struct io_ring_ctx *ctx = filp->private_data;
 	void *ptr;
 
 	/*
@@ -284,6 +287,8 @@ unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr,
 	if (addr)
 		return -EINVAL;
 
+	guard(mutex)(&ctx->resize_lock);
+
 	ptr = io_uring_validate_mmap_request(filp, pgoff, len);
 	if (IS_ERR(ptr))
 		return -ENOMEM;
@@ -329,8 +334,11 @@ unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr,
 					 unsigned long len, unsigned long pgoff,
 					 unsigned long flags)
 {
+	struct io_ring_ctx *ctx = file->private_data;
 	void *ptr;
 
+	guard(mutex)(&ctx->resize_lock);
+
 	ptr = io_uring_validate_mmap_request(file, pgoff, len);
 	if (IS_ERR(ptr))
 		return PTR_ERR(ptr);
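
A note on the guard(mutex)() lines added above: they use the kernel's scope-based cleanup helpers from <linux/cleanup.h>, so ctx->resize_lock is taken at the point of declaration and released automatically on every exit path from the function. A minimal sketch of the pattern (the function below is hypothetical, not part of this commit):

#include <linux/cleanup.h>
#include <linux/mutex.h>

static int do_locked_work(struct mutex *lock, int fail)
{
        guard(mutex)(lock);     /* mutex_lock(lock) happens here */

        if (fail)
                return -EINVAL; /* the lock is dropped on this early return */

        return 0;               /* ...and on the normal return as well */
}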

io_uring/register.c

Lines changed: 215 additions & 0 deletions
@@ -29,6 +29,7 @@
 #include "napi.h"
 #include "eventfd.h"
 #include "msg_ring.h"
+#include "memmap.h"
 
 #define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)
@@ -361,6 +362,214 @@ static int io_register_clock(struct io_ring_ctx *ctx,
 	return 0;
 }
 
+/*
+ * State to maintain until we can swap. Both new and old state, used for
+ * either mapping or freeing.
+ */
+struct io_ring_ctx_rings {
+	unsigned short n_ring_pages;
+	unsigned short n_sqe_pages;
+	struct page **ring_pages;
+	struct page **sqe_pages;
+	struct io_uring_sqe *sq_sqes;
+	struct io_rings *rings;
+};
+
+static void io_register_free_rings(struct io_uring_params *p,
+				   struct io_ring_ctx_rings *r)
+{
+	if (!(p->flags & IORING_SETUP_NO_MMAP)) {
+		io_pages_unmap(r->rings, &r->ring_pages, &r->n_ring_pages,
+				true);
+		io_pages_unmap(r->sq_sqes, &r->sqe_pages, &r->n_sqe_pages,
+				true);
+	} else {
+		io_pages_free(&r->ring_pages, r->n_ring_pages);
+		io_pages_free(&r->sqe_pages, r->n_sqe_pages);
+		vunmap(r->rings);
+		vunmap(r->sq_sqes);
+	}
+}
+
+#define swap_old(ctx, o, n, field)		\
+	do {					\
+		(o).field = (ctx)->field;	\
+		(ctx)->field = (n).field;	\
+	} while (0)
+
+#define RESIZE_FLAGS	(IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
+#define COPY_FLAGS	(IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
+			 IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP)
+
+static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
+{
+	struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
+	size_t size, sq_array_offset;
+	struct io_uring_params p;
+	unsigned i, tail;
+	void *ptr;
+	int ret;
+
+	/* for single issuer, must be owner resizing */
+	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER &&
+	    current != ctx->submitter_task)
+		return -EEXIST;
+	if (copy_from_user(&p, arg, sizeof(p)))
+		return -EFAULT;
+	if (p.flags & ~RESIZE_FLAGS)
+		return -EINVAL;
+
+	/* properties that are always inherited */
+	p.flags |= (ctx->flags & COPY_FLAGS);
+
+	ret = io_uring_fill_params(p.sq_entries, &p);
+	if (unlikely(ret))
+		return ret;
+
+	/* nothing to do, but copy params back */
+	if (p.sq_entries == ctx->sq_entries && p.cq_entries == ctx->cq_entries) {
+		if (copy_to_user(arg, &p, sizeof(p)))
+			return -EFAULT;
+		return 0;
+	}
+
+	size = rings_size(p.flags, p.sq_entries, p.cq_entries,
+			  &sq_array_offset);
+	if (size == SIZE_MAX)
+		return -EOVERFLOW;
+
+	if (!(p.flags & IORING_SETUP_NO_MMAP))
+		n.rings = io_pages_map(&n.ring_pages, &n.n_ring_pages, size);
+	else
+		n.rings = __io_uaddr_map(&n.ring_pages, &n.n_ring_pages,
+					 p.cq_off.user_addr, size);
+	if (IS_ERR(n.rings))
+		return PTR_ERR(n.rings);
+
+	n.rings->sq_ring_mask = p.sq_entries - 1;
+	n.rings->cq_ring_mask = p.cq_entries - 1;
+	n.rings->sq_ring_entries = p.sq_entries;
+	n.rings->cq_ring_entries = p.cq_entries;
+
+	if (copy_to_user(arg, &p, sizeof(p))) {
+		io_register_free_rings(&p, &n);
+		return -EFAULT;
+	}
+
+	if (p.flags & IORING_SETUP_SQE128)
+		size = array_size(2 * sizeof(struct io_uring_sqe), p.sq_entries);
+	else
+		size = array_size(sizeof(struct io_uring_sqe), p.sq_entries);
+	if (size == SIZE_MAX) {
+		io_register_free_rings(&p, &n);
+		return -EOVERFLOW;
+	}
+
+	if (!(p.flags & IORING_SETUP_NO_MMAP))
+		ptr = io_pages_map(&n.sqe_pages, &n.n_sqe_pages, size);
+	else
+		ptr = __io_uaddr_map(&n.sqe_pages, &n.n_sqe_pages,
+				     p.sq_off.user_addr,
+				     size);
+	if (IS_ERR(ptr)) {
+		io_register_free_rings(&p, &n);
+		return PTR_ERR(ptr);
+	}
+
+	/*
+	 * If using SQPOLL, park the thread
+	 */
+	if (ctx->sq_data) {
+		mutex_unlock(&ctx->uring_lock);
+		io_sq_thread_park(ctx->sq_data);
+		mutex_lock(&ctx->uring_lock);
+	}
+
+	/*
+	 * We'll do the swap. Grab the ctx->resize_lock, which will exclude
+	 * any new mmap's on the ring fd. Clear out existing mappings to prevent
+	 * mmap from seeing them, as we'll unmap them. Any attempt to mmap
+	 * existing rings beyond this point will fail. Not that it could proceed
+	 * at this point anyway, as the io_uring mmap side needs to grab the
+	 * ctx->resize_lock as well. Likewise, hold the completion lock over the
+	 * duration of the actual swap.
+	 */
+	mutex_lock(&ctx->resize_lock);
+	spin_lock(&ctx->completion_lock);
+	o.rings = ctx->rings;
+	ctx->rings = NULL;
+	o.sq_sqes = ctx->sq_sqes;
+	ctx->sq_sqes = NULL;
+
+	/*
+	 * Now copy SQ and CQ entries, if any. If either of the destination
+	 * rings can't hold what is already there, then fail the operation.
+	 */
+	n.sq_sqes = ptr;
+	tail = o.rings->sq.tail;
+	if (tail - o.rings->sq.head > p.sq_entries)
+		goto overflow;
+	for (i = o.rings->sq.head; i < tail; i++) {
+		unsigned src_head = i & (ctx->sq_entries - 1);
+		unsigned dst_head = i & n.rings->sq_ring_mask;
+
+		n.sq_sqes[dst_head] = o.sq_sqes[src_head];
+	}
+	n.rings->sq.head = o.rings->sq.head;
+	n.rings->sq.tail = o.rings->sq.tail;
+
+	tail = o.rings->cq.tail;
+	if (tail - o.rings->cq.head > p.cq_entries) {
+overflow:
+		/* restore old rings, and return -EOVERFLOW via cleanup path */
+		ctx->rings = o.rings;
+		ctx->sq_sqes = o.sq_sqes;
+		to_free = &n;
+		ret = -EOVERFLOW;
+		goto out;
+	}
+	for (i = o.rings->cq.head; i < tail; i++) {
+		unsigned src_head = i & (ctx->cq_entries - 1);
+		unsigned dst_head = i & n.rings->cq_ring_mask;
+
+		n.rings->cqes[dst_head] = o.rings->cqes[src_head];
+	}
+	n.rings->cq.head = o.rings->cq.head;
+	n.rings->cq.tail = o.rings->cq.tail;
+	/* invalidate cached cqe refill */
+	ctx->cqe_cached = ctx->cqe_sentinel = NULL;
+
+	n.rings->sq_dropped = o.rings->sq_dropped;
+	n.rings->sq_flags = o.rings->sq_flags;
+	n.rings->cq_flags = o.rings->cq_flags;
+	n.rings->cq_overflow = o.rings->cq_overflow;
+
+	/* all done, store old pointers and assign new ones */
+	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
+		ctx->sq_array = (u32 *)((char *)n.rings + sq_array_offset);
+
+	ctx->sq_entries = p.sq_entries;
+	ctx->cq_entries = p.cq_entries;
+
+	ctx->rings = n.rings;
+	ctx->sq_sqes = n.sq_sqes;
+	swap_old(ctx, o, n, n_ring_pages);
+	swap_old(ctx, o, n, n_sqe_pages);
+	swap_old(ctx, o, n, ring_pages);
+	swap_old(ctx, o, n, sqe_pages);
+	to_free = &o;
+	ret = 0;
+out:
+	spin_unlock(&ctx->completion_lock);
+	mutex_unlock(&ctx->resize_lock);
+	io_register_free_rings(&p, to_free);
+
+	if (ctx->sq_data)
+		io_sq_thread_unpark(ctx->sq_data);
+
+	return ret;
+}
+
 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
@@ -549,6 +758,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_register_clone_buffers(ctx, arg);
 		break;
+	case IORING_REGISTER_RESIZE_RINGS:
+		ret = -EINVAL;
+		if (!arg || nr_args != 1)
+			break;
+		ret = io_register_resize_rings(ctx, arg);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
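
As a side note on the copy loops in io_register_resize_rings() above: SQ/CQ head and tail are free-running 32-bit counters, the number of unconsumed entries is tail - head, and a counter maps to a slot by masking with ring_entries - 1 (ring sizes are powers of two), which is why each entry is re-indexed with the destination ring's mask during the copy. A tiny standalone illustration with made-up values, not part of this commit:

#include <assert.h>

int main(void)
{
        unsigned head = 13, tail = 17;  /* free-running ring counters */

        /* number of unconsumed entries, checked against the new ring size */
        assert(tail - head == 4);

        /* the same logical position lands in different slots for different ring sizes */
        assert((head & (8 - 1)) == 5);          /* slot in an 8-entry source ring */
        assert((head & (16 - 1)) == 13);        /* slot in a 16-entry destination ring */
        return 0;
}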
