io_uring: add support for fixed wait regions
Generally applications have one or just a few types of waiting, yet they pass in a struct io_uring_getevents_arg every time. This needs to get copied and, in turn, the timeout value needs to get copied.

Rather than do this for every invocation, allow the application to register a fixed set of wait regions that can simply be indexed when asking the kernel to wait on events.

At ring setup time, the application can register a number of these wait regions and initialize region/index 0 upfront:

        struct io_uring_reg_wait *reg;

        reg = io_uring_setup_reg_wait(ring, nr_regions, &ret);

        /* set timeout and mark as set, sigmask/sigmask_sz as needed */
        reg->ts.tv_sec = 0;
        reg->ts.tv_nsec = 100000;
        reg->flags = IORING_REG_WAIT_TS;

where nr_regions >= 1 && nr_regions <= PAGE_SIZE / sizeof(*reg). The above initializes index 0, but 63 other regions can be initialized, if needed.

Now, instead of doing:

        struct __kernel_timespec timeout = { .tv_nsec = 100000, };

        io_uring_submit_and_wait_timeout(ring, &cqe, nr, &timeout, NULL);

to wait for events for each submit_and_wait, or just wait, operation, it can just reference the above region at offset 0 and do:

        io_uring_submit_and_wait_reg(ring, &cqe, nr, 0);

to achieve the same goal of waiting 100usec without needing to copy both struct io_uring_getevents_arg (24b) and struct __kernel_timespec (16b) for each invocation. Struct io_uring_reg_wait looks as follows:

        struct io_uring_reg_wait {
                struct __kernel_timespec        ts;
                __u32                           min_wait_usec;
                __u32                           flags;
                __u64                           sigmask;
                __u32                           sigmask_sz;
                __u32                           pad[3];
                __u64                           pad2[2];
        };

embedding the timeout itself in the region, rather than passing it as a pointer as well. Note that the signal mask is still passed as a pointer, both for compatibility reasons, but also because there doesn't seem to be a lot of high frequency wait scenarios that involve setting and resetting the signal mask for each wait.

The application is free to modify any region before a wait call, or it can keep multiple regions with different settings to avoid needing to modify the same one for wait calls. Up to a page size of regions is mapped by default, allowing PAGE_SIZE / 64 available regions for use.

The registered region must fit within a page. On a 4kb page size system, that allows for 64 wait regions if a full page is used, as the size of struct io_uring_reg_wait is 64b. The registered region must be aligned to the size of struct io_uring_reg_wait. It's valid to register fewer than 64 entries.

In network performance testing with zero-copy, this reduced the time spent waiting on the TX side from 3.12% to 0.3% and the RX side from 4.4% to 0.3%.

Wait regions are fixed for the lifetime of the ring - once registered, they are persistent until the ring is torn down. The regions support minimum wait timeout as well as the regular waits.

Signed-off-by: Jens Axboe <[email protected]>
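For reference, io_uring_setup_reg_wait() above is the liburing convenience helper; at the syscall level the same setup comes down to the IORING_REGISTER_CQWAIT_REG opcode added in this commit. Below is a minimal sketch, assuming updated <linux/io_uring.h> headers that already carry the new structures and opcode, and using a raw syscall wrapper; setup_reg_wait() here is a hypothetical helper, not a liburing or kernel symbol.

        #include <linux/io_uring.h>
        #include <stdlib.h>
        #include <string.h>
        #include <sys/syscall.h>
        #include <unistd.h>

        /*
         * Hypothetical helper: register one page of wait regions with a ring.
         * Assumes <linux/io_uring.h> defines IORING_REGISTER_CQWAIT_REG,
         * struct io_uring_cqwait_reg_arg and struct io_uring_reg_wait.
         */
        static struct io_uring_reg_wait *setup_reg_wait(int ring_fd,
                                                        unsigned nr_entries)
        {
                size_t page_size = sysconf(_SC_PAGESIZE);
                struct io_uring_cqwait_reg_arg arg;
                struct io_uring_reg_wait *reg;

                if (!nr_entries || nr_entries > page_size / sizeof(*reg))
                        return NULL;

                /* one whole page: satisfies the in-page and alignment checks */
                reg = aligned_alloc(page_size, page_size);
                if (!reg)
                        return NULL;
                memset(reg, 0, page_size);

                memset(&arg, 0, sizeof(arg));
                arg.struct_size = sizeof(*reg);
                arg.nr_entries = nr_entries;
                arg.user_addr = (unsigned long) reg;

                if (syscall(__NR_io_uring_register, ring_fd,
                            IORING_REGISTER_CQWAIT_REG, &arg, 1) < 0) {
                        free(reg);
                        return NULL;
                }
                return reg;
        }

Allocating a full page up front keeps the region trivially within the single-page and struct-size-alignment constraints that the kernel enforces at registration time.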
1 parent 371b47d commit aa00f67

5 files changed, +191 −11 lines

include/linux/io_uring_types.h

Lines changed: 10 additions & 0 deletions
@@ -327,6 +327,14 @@ struct io_ring_ctx {
         atomic_t                cq_wait_nr;
         atomic_t                cq_timeouts;
         struct wait_queue_head  cq_wait;
+
+        /*
+         * If registered with IORING_REGISTER_CQWAIT_REG, a single
+         * page holds N entries, mapped in cq_wait_arg. cq_wait_index
+         * is the maximum allowable index.
+         */
+        struct io_uring_reg_wait        *cq_wait_arg;
+        unsigned char                   cq_wait_index;
 } ____cacheline_aligned_in_smp;

         /* timeouts */
@@ -430,6 +438,8 @@ struct io_ring_ctx {
         unsigned short          n_sqe_pages;
         struct page             **ring_pages;
         struct page             **sqe_pages;
+
+        struct page             **cq_wait_page;
 };

 struct io_tw_state {

include/uapi/linux/io_uring.h

Lines changed: 41 additions & 0 deletions
@@ -518,6 +518,7 @@ struct io_cqring_offsets {
 #define IORING_ENTER_EXT_ARG            (1U << 3)
 #define IORING_ENTER_REGISTERED_RING    (1U << 4)
 #define IORING_ENTER_ABS_TIMER          (1U << 5)
+#define IORING_ENTER_EXT_ARG_REG        (1U << 6)

 /*
  * Passed in for io_uring_setup(2). Copied back with updated info on success
@@ -620,6 +621,9 @@ enum io_uring_register_op {
         /* resize CQ ring */
         IORING_REGISTER_RESIZE_RINGS            = 33,

+        /* register fixed io_uring_reg_wait arguments */
+        IORING_REGISTER_CQWAIT_REG              = 34,
+
         /* this goes last */
         IORING_REGISTER_LAST,
@@ -803,6 +807,43 @@ enum io_uring_register_restriction_op {
         IORING_RESTRICTION_LAST
 };

+enum {
+        IORING_REG_WAIT_TS              = (1U << 0),
+};
+
+/*
+ * Argument for IORING_REGISTER_CQWAIT_REG, registering a region of
+ * struct io_uring_reg_wait that can be indexed when io_uring_enter(2) is
+ * called rather than pass in a wait argument structure separately.
+ */
+struct io_uring_cqwait_reg_arg {
+        __u32           flags;
+        __u32           struct_size;
+        __u32           nr_entries;
+        __u32           pad;
+        __u64           user_addr;
+        __u64           pad2[3];
+};
+
+/*
+ * Argument for io_uring_enter(2) with
+ * IORING_GETEVENTS | IORING_ENTER_EXT_ARG_REG set, where the actual argument
+ * is an index into a previously registered fixed wait region described by
+ * the below structure.
+ */
+struct io_uring_reg_wait {
+        struct __kernel_timespec        ts;
+        __u32                           min_wait_usec;
+        __u32                           flags;
+        __u64                           sigmask;
+        __u32                           sigmask_sz;
+        __u32                           pad[3];
+        __u64                           pad2[2];
+};
+
+/*
+ * Argument for io_uring_enter(2) with IORING_GETEVENTS | IORING_ENTER_EXT_ARG
+ */
 struct io_uring_getevents_arg {
         __u64   sigmask;
         __u32   sigmask_sz;
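
The padding in struct io_uring_reg_wait above is what fixes the entry size at 64 bytes (16 bytes of timespec, 20 bytes of live fields, 28 bytes of padding), matching the 64-regions-per-4KB-page figure in the commit message. An application that relies on that layout could carry a compile-time check along these lines (illustrative sketch only):

        #include <assert.h>
        #include <linux/io_uring.h>

        /* 16 (ts) + 20 (min_wait_usec, flags, sigmask, sigmask_sz) + 28 (padding) */
        static_assert(sizeof(struct io_uring_reg_wait) == 64,
                      "fixed wait region entries are expected to stay 64 bytes");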

io_uring/io_uring.c

Lines changed: 57 additions & 11 deletions
@@ -2736,6 +2736,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
         io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free);
         io_futex_cache_free(ctx);
         io_destroy_buffers(ctx);
+        io_unregister_cqwait_reg(ctx);
         mutex_unlock(&ctx->uring_lock);
         if (ctx->sq_creds)
                 put_cred(ctx->sq_creds);
@@ -3224,21 +3225,43 @@ void __io_uring_cancel(bool cancel_all)
         io_uring_cancel_generic(cancel_all, NULL);
 }

-static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz)
+static struct io_uring_reg_wait *io_get_ext_arg_reg(struct io_ring_ctx *ctx,
+                        const struct io_uring_getevents_arg __user *uarg)
 {
-        if (flags & IORING_ENTER_EXT_ARG) {
-                struct io_uring_getevents_arg arg;
+        struct io_uring_reg_wait *arg = READ_ONCE(ctx->cq_wait_arg);

-                if (argsz != sizeof(arg))
+        if (arg) {
+                unsigned int index = (unsigned int) (uintptr_t) uarg;
+
+                if (index <= ctx->cq_wait_index)
+                        return arg + index;
+        }
+
+        return ERR_PTR(-EFAULT);
+}
+
+static int io_validate_ext_arg(struct io_ring_ctx *ctx, unsigned flags,
+                               const void __user *argp, size_t argsz)
+{
+        struct io_uring_getevents_arg arg;
+
+        if (!(flags & IORING_ENTER_EXT_ARG))
+                return 0;
+
+        if (flags & IORING_ENTER_EXT_ARG_REG) {
+                if (argsz != sizeof(struct io_uring_reg_wait))
                         return -EINVAL;
-                if (copy_from_user(&arg, argp, sizeof(arg)))
-                        return -EFAULT;
+                return PTR_ERR(io_get_ext_arg_reg(ctx, argp));
         }
+        if (argsz != sizeof(arg))
+                return -EINVAL;
+        if (copy_from_user(&arg, argp, sizeof(arg)))
+                return -EFAULT;
         return 0;
 }

-static int io_get_ext_arg(unsigned flags, const void __user *argp,
-                          struct ext_arg *ext_arg)
+static int io_get_ext_arg(struct io_ring_ctx *ctx, unsigned flags,
+                          const void __user *argp, struct ext_arg *ext_arg)
 {
         const struct io_uring_getevents_arg __user *uarg = argp;
         struct io_uring_getevents_arg arg;
@@ -3252,6 +3275,28 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp,
                 return 0;
         }

+        if (flags & IORING_ENTER_EXT_ARG_REG) {
+                struct io_uring_reg_wait *w;
+
+                if (ext_arg->argsz != sizeof(struct io_uring_reg_wait))
+                        return -EINVAL;
+                w = io_get_ext_arg_reg(ctx, argp);
+                if (IS_ERR(w))
+                        return PTR_ERR(w);
+
+                if (w->flags & ~IORING_REG_WAIT_TS)
+                        return -EINVAL;
+                ext_arg->min_time = READ_ONCE(w->min_wait_usec) * NSEC_PER_USEC;
+                ext_arg->sig = u64_to_user_ptr(READ_ONCE(w->sigmask));
+                ext_arg->argsz = READ_ONCE(w->sigmask_sz);
+                if (w->flags & IORING_REG_WAIT_TS) {
+                        ext_arg->ts.tv_sec = READ_ONCE(w->ts.tv_sec);
+                        ext_arg->ts.tv_nsec = READ_ONCE(w->ts.tv_nsec);
+                        ext_arg->ts_set = true;
+                }
+                return 0;
+        }
+
         /*
          * EXT_ARG is set - ensure we agree on the size of it and copy in our
          * timespec and sigset_t pointers if good.
@@ -3297,7 +3342,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
         if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
                                IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
                                IORING_ENTER_REGISTERED_RING |
-                               IORING_ENTER_ABS_TIMER)))
+                               IORING_ENTER_ABS_TIMER |
+                               IORING_ENTER_EXT_ARG_REG)))
                 return -EINVAL;

         /*
@@ -3380,7 +3426,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
          */
         mutex_lock(&ctx->uring_lock);
 iopoll_locked:
-        ret2 = io_validate_ext_arg(flags, argp, argsz);
+        ret2 = io_validate_ext_arg(ctx, flags, argp, argsz);
         if (likely(!ret2)) {
                 min_complete = min(min_complete,
                                    ctx->cq_entries);
@@ -3390,7 +3436,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
         } else {
                 struct ext_arg ext_arg = { .argsz = argsz };

-                ret2 = io_get_ext_arg(flags, argp, &ext_arg);
+                ret2 = io_get_ext_arg(ctx, flags, argp, &ext_arg);
                 if (likely(!ret2)) {
                         min_complete = min(min_complete,
                                            ctx->cq_entries);
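
As the io_get_ext_arg_reg() path above shows, with IORING_ENTER_EXT_ARG_REG set the io_uring_enter(2) argument pointer is reinterpreted as an index into the registered region, while argsz must still equal sizeof(struct io_uring_reg_wait). A sketch of the corresponding wait call, assuming a raw syscall wrapper; wait_reg() is illustrative and is not the liburing io_uring_submit_and_wait_reg() helper:

        #include <linux/io_uring.h>
        #include <sys/syscall.h>
        #include <unistd.h>

        /*
         * Wait for completions using fixed wait region 'index'. The index
         * travels in the argument-pointer slot; no io_uring_getevents_arg
         * or timespec is copied from userspace.
         */
        static int wait_reg(int ring_fd, unsigned min_complete, unsigned index)
        {
                unsigned flags = IORING_ENTER_GETEVENTS | IORING_ENTER_EXT_ARG |
                                 IORING_ENTER_EXT_ARG_REG;

                return syscall(__NR_io_uring_enter, ring_fd, 0, min_complete,
                               flags, (void *)(unsigned long) index,
                               sizeof(struct io_uring_reg_wait));
        }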

io_uring/register.c

Lines changed: 82 additions & 0 deletions
@@ -570,6 +570,82 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
         return ret;
 }

+void io_unregister_cqwait_reg(struct io_ring_ctx *ctx)
+{
+        unsigned short npages = 1;
+
+        if (!ctx->cq_wait_page)
+                return;
+
+        io_pages_unmap(ctx->cq_wait_arg, &ctx->cq_wait_page, &npages, true);
+        ctx->cq_wait_arg = NULL;
+        if (ctx->user)
+                __io_unaccount_mem(ctx->user, 1);
+}
+
+/*
+ * Register a page holding N entries of struct io_uring_reg_wait, which can
+ * be used via io_uring_enter(2) if IORING_GETEVENTS_EXT_ARG_REG is set.
+ * If that is set with IORING_GETEVENTS_EXT_ARG, then instead of passing
+ * in a pointer for a struct io_uring_getevents_arg, an index into this
+ * registered array is passed, avoiding two (arg + timeout) copies per
+ * invocation.
+ */
+static int io_register_cqwait_reg(struct io_ring_ctx *ctx, void __user *uarg)
+{
+        struct io_uring_cqwait_reg_arg arg;
+        struct io_uring_reg_wait *reg;
+        struct page **pages;
+        unsigned long len;
+        int nr_pages, poff;
+        int ret;
+
+        if (ctx->cq_wait_page || ctx->cq_wait_arg)
+                return -EBUSY;
+        if (copy_from_user(&arg, uarg, sizeof(arg)))
+                return -EFAULT;
+        if (!arg.nr_entries || arg.flags)
+                return -EINVAL;
+        if (arg.struct_size != sizeof(*reg))
+                return -EINVAL;
+        if (check_mul_overflow(arg.struct_size, arg.nr_entries, &len))
+                return -EOVERFLOW;
+        if (len > PAGE_SIZE)
+                return -EINVAL;
+        /* offset + len must fit within a page, and must be reg_wait aligned */
+        poff = arg.user_addr & ~PAGE_MASK;
+        if (len + poff > PAGE_SIZE)
+                return -EINVAL;
+        if (poff % arg.struct_size)
+                return -EINVAL;
+
+        pages = io_pin_pages(arg.user_addr, len, &nr_pages);
+        if (IS_ERR(pages))
+                return PTR_ERR(pages);
+        ret = -EINVAL;
+        if (nr_pages != 1)
+                goto out_free;
+        if (ctx->user) {
+                ret = __io_account_mem(ctx->user, 1);
+                if (ret)
+                        goto out_free;
+        }
+
+        reg = vmap(pages, 1, VM_MAP, PAGE_KERNEL);
+        if (reg) {
+                ctx->cq_wait_index = arg.nr_entries - 1;
+                WRITE_ONCE(ctx->cq_wait_page, pages);
+                WRITE_ONCE(ctx->cq_wait_arg, (void *) reg + poff);
+                return 0;
+        }
+        ret = -ENOMEM;
+        if (ctx->user)
+                __io_unaccount_mem(ctx->user, 1);
+out_free:
+        io_pages_free(&pages, nr_pages);
+        return ret;
+}
+
 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
                                void __user *arg, unsigned nr_args)
         __releases(ctx->uring_lock)
@@ -764,6 +840,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
                         break;
                 ret = io_register_resize_rings(ctx, arg);
                 break;
+        case IORING_REGISTER_CQWAIT_REG:
+                ret = -EINVAL;
+                if (!arg || nr_args != 1)
+                        break;
+                ret = io_register_cqwait_reg(ctx, arg);
+                break;
         default:
                 ret = -EINVAL;
                 break;
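
io_register_cqwait_reg() above only accepts a region that sits entirely in one page, starts at an offset that is a multiple of the entry size, and holds at least one entry. A userspace pre-flight check mirroring those constraints might look like the following sketch (illustrative only; cqwait_region_ok() is not a kernel or liburing symbol):

        #include <linux/io_uring.h>
        #include <stdbool.h>
        #include <stdint.h>
        #include <unistd.h>

        /* mirror of the kernel-side checks before IORING_REGISTER_CQWAIT_REG */
        static bool cqwait_region_ok(const void *addr, unsigned nr_entries)
        {
                size_t page_size = sysconf(_SC_PAGESIZE);
                size_t len = nr_entries * sizeof(struct io_uring_reg_wait);
                size_t poff = (uintptr_t) addr & (page_size - 1);

                if (!nr_entries || len > page_size)
                        return false;
                if (len + poff > page_size)     /* must fit within one page */
                        return false;
                if (poff % sizeof(struct io_uring_reg_wait))    /* entry-aligned */
                        return false;
                return true;
        }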

io_uring/register.h

Lines changed: 1 addition & 0 deletions
@@ -5,5 +5,6 @@
 int io_eventfd_unregister(struct io_ring_ctx *ctx);
 int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id);
 struct file *io_uring_register_get_file(unsigned int fd, bool registered);
+void io_unregister_cqwait_reg(struct io_ring_ctx *ctx);

 #endif
