
Commit d97e594

John Garry authored and axboe committed
blk-mq: Use request queue-wide tags for tagset-wide sbitmap
The tags used for an IO scheduler are currently per hctx. As such, when
q->nr_hw_queues grows, so does the request queue's total IO scheduler tag
depth. This may cause problems for SCSI MQ HBAs whose total driver depth
is fixed.

Ming and Yanhui report higher CPU usage and lower throughput in scenarios
where the fixed total driver tag depth is appreciably lower than the total
scheduler tag depth:
https://lore.kernel.org/linux-block/[email protected]/T/#mc0d6d4f95275a2743d1c8c3e4dc9ff6c9aa3a76b

In that scenario, since the scheduler tag is acquired first, much
contention is introduced, as a driver tag may not be available after we
have acquired the sched tag.

Improve this scenario by introducing request queue-wide tags for when a
tagset-wide sbitmap is used. The static sched requests are still allocated
per hctx, as requests are initialised per hctx, as in
blk_mq_init_request(..., hctx_idx, ...) ->
set->ops->init_request(..., hctx_idx, ...).

For simplicity of resizing the request queue sbitmap when updating the
request queue depth, just init at the max possible size, so we don't need
to deal with possibly swapping out a new sbitmap for the old one if we
need to grow.

Signed-off-by: John Garry <[email protected]>
Reviewed-by: Ming Lei <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Jens Axboe <[email protected]>
1 parent 56b6808 commit d97e594
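
For context, here is a minimal userspace sketch of the depth arithmetic behind the problem the commit message describes. This is illustration only, not kernel code; the queue count and depths are hypothetical examples, not values from any real HBA.

/*
 * Illustration only -- plain userspace C. With per-hctx scheduler tags,
 * total sched depth scales with nr_hw_queues and can far exceed a fixed
 * tagset-wide driver depth, so a request holding a sched tag often finds
 * no driver tag free. All numbers below are hypothetical.
 */
#include <stdio.h>

int main(void)
{
	unsigned int nr_hw_queues = 16;		/* q->nr_hw_queues */
	unsigned int nr_requests = 256;		/* sched tag depth per hctx */
	unsigned int driver_depth = 1024;	/* fixed tagset-wide driver depth */

	/* Before: every hctx owns a full sched tag space. */
	unsigned int sched_before = nr_hw_queues * nr_requests;
	/* After: one queue-wide sched tag space shared by all hctxs. */
	unsigned int sched_after = nr_requests;

	printf("driver tags:       %u\n", driver_depth);
	printf("sched tags before: %u (4x oversubscribed)\n", sched_before);
	printf("sched tags after:  %u\n", sched_after);
	return 0;
}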

File tree

block/blk-mq-sched.c
block/blk-mq-sched.h
block/blk-mq-tag.c
block/blk-mq.c
include/linux/blkdev.h

5 files changed: +76 -21 lines changed


block/blk-mq-sched.c

Lines changed: 54 additions & 13 deletions
@@ -509,11 +509,9 @@ static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
 				    struct blk_mq_hw_ctx *hctx,
 				    unsigned int hctx_idx)
 {
-	unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;
-
 	if (hctx->sched_tags) {
 		blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx);
-		blk_mq_free_rq_map(hctx->sched_tags, flags);
+		blk_mq_free_rq_map(hctx->sched_tags, set->flags);
 		hctx->sched_tags = NULL;
 	}
 }
@@ -523,12 +521,10 @@ static int blk_mq_sched_alloc_tags(struct request_queue *q,
 				   unsigned int hctx_idx)
 {
 	struct blk_mq_tag_set *set = q->tag_set;
-	/* Clear HCTX_SHARED so tags are init'ed */
-	unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;
 	int ret;
 
 	hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
-					       set->reserved_tags, flags);
+					       set->reserved_tags, set->flags);
 	if (!hctx->sched_tags)
 		return -ENOMEM;
 
@@ -546,16 +542,50 @@ static void blk_mq_sched_tags_teardown(struct request_queue *q)
 	int i;
 
 	queue_for_each_hw_ctx(q, hctx, i) {
-		/* Clear HCTX_SHARED so tags are freed */
-		unsigned int flags = hctx->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;
-
 		if (hctx->sched_tags) {
-			blk_mq_free_rq_map(hctx->sched_tags, flags);
+			blk_mq_free_rq_map(hctx->sched_tags, hctx->flags);
 			hctx->sched_tags = NULL;
 		}
 	}
 }
 
+static int blk_mq_init_sched_shared_sbitmap(struct request_queue *queue)
+{
+	struct blk_mq_tag_set *set = queue->tag_set;
+	int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags);
+	struct blk_mq_hw_ctx *hctx;
+	int ret, i;
+
+	/*
+	 * Set initial depth at max so that we don't need to reallocate for
+	 * updating nr_requests.
+	 */
+	ret = blk_mq_init_bitmaps(&queue->sched_bitmap_tags,
+				  &queue->sched_breserved_tags,
+				  MAX_SCHED_RQ, set->reserved_tags,
+				  set->numa_node, alloc_policy);
+	if (ret)
+		return ret;
+
+	queue_for_each_hw_ctx(queue, hctx, i) {
+		hctx->sched_tags->bitmap_tags =
+				&queue->sched_bitmap_tags;
+		hctx->sched_tags->breserved_tags =
+				&queue->sched_breserved_tags;
+	}
+
+	sbitmap_queue_resize(&queue->sched_bitmap_tags,
+			     queue->nr_requests - set->reserved_tags);
+
+	return 0;
+}
+
+static void blk_mq_exit_sched_shared_sbitmap(struct request_queue *queue)
+{
+	sbitmap_queue_free(&queue->sched_bitmap_tags);
+	sbitmap_queue_free(&queue->sched_breserved_tags);
+}
+
 int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
 {
 	struct blk_mq_hw_ctx *hctx;
@@ -580,12 +610,18 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
 	queue_for_each_hw_ctx(q, hctx, i) {
 		ret = blk_mq_sched_alloc_tags(q, hctx, i);
 		if (ret)
-			goto err;
+			goto err_free_tags;
+	}
+
+	if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) {
+		ret = blk_mq_init_sched_shared_sbitmap(q);
+		if (ret)
+			goto err_free_tags;
 	}
 
 	ret = e->ops.init_sched(q, e);
 	if (ret)
-		goto err;
+		goto err_free_sbitmap;
 
 	blk_mq_debugfs_register_sched(q);
 
@@ -605,7 +641,10 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
 
 	return 0;
 
-err:
+err_free_sbitmap:
+	if (blk_mq_is_sbitmap_shared(q->tag_set->flags))
+		blk_mq_exit_sched_shared_sbitmap(q);
+err_free_tags:
 	blk_mq_sched_free_requests(q);
 	blk_mq_sched_tags_teardown(q);
 	q->elevator = NULL;
@@ -643,5 +682,7 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
 	if (e->type->ops.exit_sched)
 		e->type->ops.exit_sched(e);
 	blk_mq_sched_tags_teardown(q);
+	if (blk_mq_is_sbitmap_shared(q->tag_set->flags))
+		blk_mq_exit_sched_shared_sbitmap(q);
 	q->elevator = NULL;
 }
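
To make the shape of blk_mq_init_sched_shared_sbitmap() above concrete, here is a compilable userspace sketch of the pointer redirection it performs. The fake_* types are invented stand-ins, not the kernel's struct blk_mq_tags or struct sbitmap_queue; only the redirection pattern mirrors the diff.

/*
 * Illustration only -- userspace C, not kernel code. Every hctx keeps
 * its own sched_tags struct (static requests stay per hctx), but all
 * of them point at one queue-wide bitmap pair.
 */
#include <stdio.h>

#define NR_HW_QUEUES 4

struct fake_sbq {
	unsigned int depth;
};

struct fake_tags {
	struct fake_sbq *bitmap_tags;		/* normal tags */
	struct fake_sbq *breserved_tags;	/* reserved tags */
};

struct fake_queue {
	struct fake_sbq sched_bitmap_tags;	/* queue-wide, allocated once */
	struct fake_sbq sched_breserved_tags;
	struct fake_tags sched_tags[NR_HW_QUEUES];	/* one per hctx */
};

int main(void)
{
	struct fake_queue q = { .sched_bitmap_tags = { .depth = 256 } };
	int i;

	/* Point every hctx's sched tags at the one queue-wide bitmap pair. */
	for (i = 0; i < NR_HW_QUEUES; i++) {
		q.sched_tags[i].bitmap_tags = &q.sched_bitmap_tags;
		q.sched_tags[i].breserved_tags = &q.sched_breserved_tags;
	}

	/* All hctxs now allocate scheduler tags from the same space. */
	printf("hctx0 and hctx3 share a bitmap: %s\n",
	       q.sched_tags[0].bitmap_tags == q.sched_tags[3].bitmap_tags ?
	       "yes" : "no");
	return 0;
}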

block/blk-mq-sched.h

Lines changed: 2 additions & 0 deletions
@@ -5,6 +5,8 @@
 #include "blk-mq.h"
 #include "blk-mq-tag.h"
 
+#define MAX_SCHED_RQ (16 * BLKDEV_MAX_RQ)
+
 void blk_mq_sched_assign_ioc(struct request *rq);
 
 bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,

block/blk-mq-tag.c

Lines changed: 5 additions & 6 deletions
@@ -13,6 +13,7 @@
 #include <linux/delay.h>
 #include "blk.h"
 #include "blk-mq.h"
+#include "blk-mq-sched.h"
 #include "blk-mq-tag.h"
 
 /*
@@ -590,8 +591,6 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
 	 */
 	if (tdepth > tags->nr_tags) {
 		struct blk_mq_tag_set *set = hctx->queue->tag_set;
-		/* Only sched tags can grow, so clear HCTX_SHARED flag */
-		unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;
 		struct blk_mq_tags *new;
 		bool ret;
 
@@ -602,21 +601,21 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
 		 * We need some sort of upper limit, set it high enough that
 		 * no valid use cases should require more.
 		 */
-		if (tdepth > 16 * BLKDEV_MAX_RQ)
+		if (tdepth > MAX_SCHED_RQ)
 			return -EINVAL;
 
 		new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth,
-					  tags->nr_reserved_tags, flags);
+					  tags->nr_reserved_tags, set->flags);
 		if (!new)
 			return -ENOMEM;
 		ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth);
 		if (ret) {
-			blk_mq_free_rq_map(new, flags);
+			blk_mq_free_rq_map(new, set->flags);
 			return -ENOMEM;
 		}
 
 		blk_mq_free_rqs(set, *tagsptr, hctx->queue_num);
-		blk_mq_free_rq_map(*tagsptr, flags);
+		blk_mq_free_rq_map(*tagsptr, set->flags);
 		*tagsptr = new;
 	} else {
 		/*

block/blk-mq.c

Lines changed: 11 additions & 2 deletions
@@ -3640,15 +3640,24 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
 		} else {
 			ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
 						      nr, true);
+			if (blk_mq_is_sbitmap_shared(set->flags)) {
+				hctx->sched_tags->bitmap_tags =
+					&q->sched_bitmap_tags;
+				hctx->sched_tags->breserved_tags =
+					&q->sched_breserved_tags;
+			}
 		}
 		if (ret)
 			break;
 		if (q->elevator && q->elevator->type->ops.depth_updated)
 			q->elevator->type->ops.depth_updated(hctx);
 	}
-
-	if (!ret)
+	if (!ret) {
 		q->nr_requests = nr;
+		if (q->elevator && blk_mq_is_sbitmap_shared(set->flags))
+			sbitmap_queue_resize(&q->sched_bitmap_tags,
+					     nr - set->reserved_tags);
+	}
 
 	blk_mq_unquiesce_queue(q);
 	blk_mq_unfreeze_queue(q);
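
The update path above works because the shared sbitmap was initialised at MAX_SCHED_RQ: growing nr_requests never needs a new allocation, only a resize. Below is a minimal sketch of that design choice, with an invented fake_sbq type standing in for the kernel's struct sbitmap_queue, and assuming BLKDEV_MAX_RQ == 128.

/*
 * Illustration only -- userspace C, not kernel code.
 */
#include <assert.h>
#include <stdio.h>

#define MAX_SCHED_RQ (16 * 128)	/* mirrors 16 * BLKDEV_MAX_RQ */

struct fake_sbq {
	unsigned int capacity;	/* bits allocated, fixed at init */
	unsigned int depth;	/* bits currently usable */
};

static void fake_sbq_init(struct fake_sbq *sbq)
{
	/* Allocate at the maximum up front; never reallocated. */
	sbq->capacity = MAX_SCHED_RQ;
	sbq->depth = MAX_SCHED_RQ;
}

static void fake_sbq_resize(struct fake_sbq *sbq, unsigned int depth)
{
	/* Any legal depth fits, so "growing" is just a field update. */
	assert(depth <= sbq->capacity);
	sbq->depth = depth;
}

int main(void)
{
	struct fake_sbq sbq;

	fake_sbq_init(&sbq);
	fake_sbq_resize(&sbq, 256);	/* e.g. nr_requests - reserved_tags */
	printf("capacity=%u depth=%u\n", sbq.capacity, sbq.depth);
	return 0;
}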

include/linux/blkdev.h

Lines changed: 4 additions & 0 deletions
@@ -25,6 +25,7 @@
 #include <linux/scatterlist.h>
 #include <linux/blkzoned.h>
 #include <linux/pm.h>
+#include <linux/sbitmap.h>
 
 struct module;
 struct scsi_ioctl_command;
@@ -493,6 +494,9 @@ struct request_queue {
 
 	atomic_t nr_active_requests_shared_sbitmap;
 
+	struct sbitmap_queue	sched_bitmap_tags;
+	struct sbitmap_queue	sched_breserved_tags;
+
 	struct list_head	icq_list;
 #ifdef CONFIG_BLK_CGROUP
 	DECLARE_BITMAP		(blkcg_pols, BLKCG_MAX_POLS);
