 #include "napi.h"
 #include "eventfd.h"
 #include "msg_ring.h"
+#include "memmap.h"
 
 #define IORING_MAX_RESTRICTIONS        (IORING_RESTRICTION_LAST + \
                                         IORING_REGISTER_LAST + IORING_OP_LAST)
@@ -361,6 +362,214 @@ static int io_register_clock(struct io_ring_ctx *ctx, |
         return 0;
 }
 
+/*
+ * State to maintain until we can swap. Both new and old state, used for
+ * either mapping or freeing.
+ */
+struct io_ring_ctx_rings {
+        unsigned short n_ring_pages;
+        unsigned short n_sqe_pages;
+        struct page **ring_pages;
+        struct page **sqe_pages;
+        struct io_uring_sqe *sq_sqes;
+        struct io_rings *rings;
+};
+
+static void io_register_free_rings(struct io_uring_params *p,
+                                   struct io_ring_ctx_rings *r)
+{
+        if (!(p->flags & IORING_SETUP_NO_MMAP)) {
+                io_pages_unmap(r->rings, &r->ring_pages, &r->n_ring_pages,
+                                true);
+                io_pages_unmap(r->sq_sqes, &r->sqe_pages, &r->n_sqe_pages,
+                                true);
+        } else {
+                io_pages_free(&r->ring_pages, r->n_ring_pages);
+                io_pages_free(&r->sqe_pages, r->n_sqe_pages);
+                vunmap(r->rings);
+                vunmap(r->sq_sqes);
+        }
+}
+
+#define swap_old(ctx, o, n, field)              \
+        do {                                    \
+                (o).field = (ctx)->field;       \
+                (ctx)->field = (n).field;       \
+        } while (0)
+
+#define RESIZE_FLAGS    (IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
+#define COPY_FLAGS      (IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
+                         IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP)
+
+static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
+{
+        struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
+        size_t size, sq_array_offset;
+        struct io_uring_params p;
+        unsigned i, tail;
+        void *ptr;
+        int ret;
+
+        /* for single issuer, must be owner resizing */
+        if (ctx->flags & IORING_SETUP_SINGLE_ISSUER &&
+            current != ctx->submitter_task)
+                return -EEXIST;
+        if (copy_from_user(&p, arg, sizeof(p)))
+                return -EFAULT;
+        if (p.flags & ~RESIZE_FLAGS)
+                return -EINVAL;
+
+        /* properties that are always inherited */
+        p.flags |= (ctx->flags & COPY_FLAGS);
+
+        ret = io_uring_fill_params(p.sq_entries, &p);
+        if (unlikely(ret))
+                return ret;
+
+        /* nothing to do, but copy params back */
+        if (p.sq_entries == ctx->sq_entries && p.cq_entries == ctx->cq_entries) {
+                if (copy_to_user(arg, &p, sizeof(p)))
+                        return -EFAULT;
+                return 0;
+        }
+
+        size = rings_size(p.flags, p.sq_entries, p.cq_entries,
+                                &sq_array_offset);
+        if (size == SIZE_MAX)
+                return -EOVERFLOW;
+
+        if (!(p.flags & IORING_SETUP_NO_MMAP))
+                n.rings = io_pages_map(&n.ring_pages, &n.n_ring_pages, size);
+        else
+                n.rings = __io_uaddr_map(&n.ring_pages, &n.n_ring_pages,
+                                         p.cq_off.user_addr, size);
+        if (IS_ERR(n.rings))
+                return PTR_ERR(n.rings);
+
+        n.rings->sq_ring_mask = p.sq_entries - 1;
+        n.rings->cq_ring_mask = p.cq_entries - 1;
+        n.rings->sq_ring_entries = p.sq_entries;
+        n.rings->cq_ring_entries = p.cq_entries;
+
+        if (copy_to_user(arg, &p, sizeof(p))) {
+                io_register_free_rings(&p, &n);
+                return -EFAULT;
+        }
+
+        if (p.flags & IORING_SETUP_SQE128)
+                size = array_size(2 * sizeof(struct io_uring_sqe), p.sq_entries);
+        else
+                size = array_size(sizeof(struct io_uring_sqe), p.sq_entries);
+        if (size == SIZE_MAX) {
+                io_register_free_rings(&p, &n);
+                return -EOVERFLOW;
+        }
+
+        if (!(p.flags & IORING_SETUP_NO_MMAP))
+                ptr = io_pages_map(&n.sqe_pages, &n.n_sqe_pages, size);
+        else
+                ptr = __io_uaddr_map(&n.sqe_pages, &n.n_sqe_pages,
+                                     p.sq_off.user_addr,
+                                     size);
+        if (IS_ERR(ptr)) {
+                io_register_free_rings(&p, &n);
+                return PTR_ERR(ptr);
+        }
+
+        /*
+         * If using SQPOLL, park the thread
+         */
+        if (ctx->sq_data) {
+                mutex_unlock(&ctx->uring_lock);
+                io_sq_thread_park(ctx->sq_data);
+                mutex_lock(&ctx->uring_lock);
+        }
+
+        /*
+         * We'll do the swap. Grab the ctx->resize_lock, which will exclude
+         * any new mmap's on the ring fd. Clear out existing mappings to prevent
+         * mmap from seeing them, as we'll unmap them. Any attempt to mmap
+         * existing rings beyond this point will fail. Not that it could proceed
+         * at this point anyway, as the io_uring mmap side needs to grab the
+         * ctx->resize_lock as well. Likewise, hold the completion lock over the
+         * duration of the actual swap.
+         */
+        mutex_lock(&ctx->resize_lock);
+        spin_lock(&ctx->completion_lock);
+        o.rings = ctx->rings;
+        ctx->rings = NULL;
+        o.sq_sqes = ctx->sq_sqes;
+        ctx->sq_sqes = NULL;
+
+        /*
+         * Now copy SQ and CQ entries, if any. If either of the destination
+         * rings can't hold what is already there, then fail the operation.
+         */
+        n.sq_sqes = ptr;
+        tail = o.rings->sq.tail;
+        if (tail - o.rings->sq.head > p.sq_entries)
+                goto overflow;
+        for (i = o.rings->sq.head; i < tail; i++) {
+                unsigned src_head = i & (ctx->sq_entries - 1);
+                unsigned dst_head = i & n.rings->sq_ring_mask;
+
+                n.sq_sqes[dst_head] = o.sq_sqes[src_head];
+        }
+        n.rings->sq.head = o.rings->sq.head;
+        n.rings->sq.tail = o.rings->sq.tail;
+
+        tail = o.rings->cq.tail;
+        if (tail - o.rings->cq.head > p.cq_entries) {
+overflow:
+                /* restore old rings, and return -EOVERFLOW via cleanup path */
+                ctx->rings = o.rings;
+                ctx->sq_sqes = o.sq_sqes;
+                to_free = &n;
+                ret = -EOVERFLOW;
+                goto out;
+        }
+        for (i = o.rings->cq.head; i < tail; i++) {
+                unsigned src_head = i & (ctx->cq_entries - 1);
+                unsigned dst_head = i & n.rings->cq_ring_mask;
+
+                n.rings->cqes[dst_head] = o.rings->cqes[src_head];
+        }
+        n.rings->cq.head = o.rings->cq.head;
+        n.rings->cq.tail = o.rings->cq.tail;
+        /* invalidate cached cqe refill */
+        ctx->cqe_cached = ctx->cqe_sentinel = NULL;
+
+        n.rings->sq_dropped = o.rings->sq_dropped;
+        n.rings->sq_flags = o.rings->sq_flags;
+        n.rings->cq_flags = o.rings->cq_flags;
+        n.rings->cq_overflow = o.rings->cq_overflow;
+
+        /* all done, store old pointers and assign new ones */
+        if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
+                ctx->sq_array = (u32 *)((char *)n.rings + sq_array_offset);
+
+        ctx->sq_entries = p.sq_entries;
+        ctx->cq_entries = p.cq_entries;
+
+        ctx->rings = n.rings;
+        ctx->sq_sqes = n.sq_sqes;
+        swap_old(ctx, o, n, n_ring_pages);
+        swap_old(ctx, o, n, n_sqe_pages);
+        swap_old(ctx, o, n, ring_pages);
+        swap_old(ctx, o, n, sqe_pages);
+        to_free = &o;
+        ret = 0;
+out:
+        spin_unlock(&ctx->completion_lock);
+        mutex_unlock(&ctx->resize_lock);
+        io_register_free_rings(&p, to_free);
+
+        if (ctx->sq_data)
+                io_sq_thread_unpark(ctx->sq_data);
+
+        return ret;
+}
+
 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
                                void __user *arg, unsigned nr_args)
         __releases(ctx->uring_lock)
@@ -549,6 +758,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, |
                         break;
                 ret = io_register_clone_buffers(ctx, arg);
                 break;
+        case IORING_REGISTER_RESIZE_RINGS:
+                ret = -EINVAL;
+                if (!arg || nr_args != 1)
+                        break;
+                ret = io_register_resize_rings(ctx, arg);
+                break;
         default:
                 ret = -EINVAL;
                 break;
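
For completeness on the userspace side: IORING_REGISTER_RESIZE_RINGS takes a struct io_uring_params and nr_args must be 1. Only IORING_SETUP_CQSIZE and IORING_SETUP_CLAMP may be passed in p.flags (everything else is inherited from the original setup), the kernel copies the resulting geometry back into the struct, and for IORING_SETUP_SINGLE_ISSUER rings only the submitter task may resize (-EEXIST otherwise). The sketch below is illustrative only: it assumes a kernel and <linux/io_uring.h> that already carry IORING_REGISTER_RESIZE_RINGS from this series, and it issues the raw register syscall rather than going through liburing.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

/* Illustrative sketch: ask the kernel to resize an existing ring. */
static int resize_rings(int ring_fd, unsigned sq_entries, unsigned cq_entries)
{
        struct io_uring_params p;
        int ret;

        memset(&p, 0, sizeof(p));
        p.sq_entries = sq_entries;
        p.flags = IORING_SETUP_CQSIZE;  /* only CQSIZE/CLAMP are accepted */
        p.cq_entries = cq_entries;

        /* arg points at a single io_uring_params, so nr_args is 1 */
        ret = syscall(__NR_io_uring_register, ring_fd,
                      IORING_REGISTER_RESIZE_RINGS, &p, 1);
        if (ret < 0) {
                perror("IORING_REGISTER_RESIZE_RINGS");
                return ret;
        }
        /* on success the kernel wrote the final geometry back into p */
        printf("resized to %u SQ / %u CQ entries\n", p.sq_entries, p.cq_entries);
        return 0;
}

A successful resize replaces the backing pages, so an application driving this by hand also has to redo its ring mmaps afterwards; recent liburing wraps both steps in a single helper, which is outside the scope of this patch.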
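
The SQ/CQ copy loops above lean on io_uring's power-of-two ring geometry: head and tail are free-running indices and only the masked value selects a slot, so pending entries can be moved between rings of different sizes while the absolute head/tail values carry over unchanged, and the only failure mode is a destination that is too small. A standalone toy model of that pattern follows; it is not kernel code and all names in it are made up for illustration.

#include <stdio.h>

/* toy ring: slot count is a power of two, head/tail are free-running */
struct toy_ring {
        unsigned head, tail, mask;      /* mask == slot count - 1 */
        int *slots;
};

/* move pending entries from src to dst, preserving absolute indices */
static int toy_ring_copy(const struct toy_ring *src, struct toy_ring *dst)
{
        unsigned i;

        /* same shape as the kernel's check: pending entries must fit */
        if (src->tail - src->head > dst->mask + 1)
                return -1;
        for (i = src->head; i != src->tail; i++)
                dst->slots[i & dst->mask] = src->slots[i & src->mask];
        dst->head = src->head;
        dst->tail = src->tail;
        return 0;
}

int main(void)
{
        int small_slots[4] = { 0 }, big_slots[8] = { 0 };
        struct toy_ring small = { .head = 5, .tail = 8, .mask = 3, .slots = small_slots };
        struct toy_ring big = { .mask = 7, .slots = big_slots };
        unsigned i;

        /* queue three entries into the small ring */
        for (i = small.head; i != small.tail; i++)
                small.slots[i & small.mask] = (int)i;

        if (!toy_ring_copy(&small, &big))
                printf("copied, head=%u tail=%u, first=%d\n",
                       big.head, big.tail, big.slots[big.head & big.mask]);
        return 0;
}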
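
One more inherited property worth spelling out is IORING_SETUP_NO_MMAP. The flag cannot be passed to the resize itself (it is not in RESIZE_FLAGS) but is carried over via COPY_FLAGS, and the new rings are then built from caller-supplied memory through __io_uaddr_map() instead of being allocated by the kernel: the ring headers come from p.cq_off.user_addr and the SQE array from p.sq_off.user_addr, mirroring io_uring_setup(2) with the same flag. The fragment below is a sketch under that assumption; new_ring_mem and new_sqe_mem are hypothetical buffers the caller has already obtained and sized according to the usual NO_MMAP rules.

#include <stdint.h>
#include <string.h>
#include <linux/io_uring.h>

/*
 * Sketch only: fill resize params for a ring originally created with
 * IORING_SETUP_NO_MMAP. The flag itself must not be set here (it is
 * inherited); the caller just points the kernel at fresh memory for the
 * rings and the SQE array.
 */
static void fill_no_mmap_resize(struct io_uring_params *p, unsigned sq_entries,
                                void *new_ring_mem, void *new_sqe_mem)
{
        memset(p, 0, sizeof(*p));
        p->sq_entries = sq_entries;
        p->cq_off.user_addr = (uintptr_t)new_ring_mem;  /* SQ/CQ rings */
        p->sq_off.user_addr = (uintptr_t)new_sqe_mem;   /* SQE array */
}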