
Commit 430a67f

Algodev-github authored and axboe committed
block, bfq: merge bursts of newly-created queues
Many throughput-sensitive workloads are made of several parallel I/O flows, with all flows generated by the same application, or more generically by the same task (e.g., system boot). The most counterproductive action with these workloads is plugging I/O dispatch when one of the bfq_queues associated with these flows remains temporarily empty.

To avoid this plugging, BFQ has been using a burst-handling mechanism for years now. This mechanism has proven effective for throughput, and not detrimental for service guarantees. This commit pushes this mechanism a little bit further, based on the following two facts.

First, all the I/O flows of the same application or task contribute to the execution/completion of that common application or task. So the performance figures that matter are the total throughput of the flows and the task-wide I/O latency. In particular, these flows do not need to be protected from each other in terms of individual bandwidth or latency.

Second, the above fact holds regardless of the number of flows.

Putting these two facts together, this commit merges stably the bfq_queues associated with these I/O flows, i.e., with the processes that generate these I/O flows, regardless of how many processes are involved.

To decide whether a set of bfq_queues is actually associated with the I/O flows of a common application or task, and to merge these queues stably, this commit operates as follows: given a bfq_queue, say Q2, currently being created, and the last bfq_queue, say Q1, created before Q2, Q2 is merged stably with Q1 if
- very little time has elapsed since Q1 was created
- Q2 has the same ioprio as Q1
- Q2 belongs to the same group as Q1

Merging bfq_queues also reduces scheduling overhead. A fio test with ten random readers on /dev/nullb shows a throughput boost of 40% on a quad-core machine. Since BFQ's execution time amounts to ~50% of the total per-request processing time, the above throughput boost implies that BFQ's overhead is reduced by more than 50%.

Tested-by: Jan Kara <[email protected]>
Signed-off-by: Paolo Valente <[email protected]>
Tested-by: Oleksandr Natalenko <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Jens Axboe <[email protected]>
1 parent 85686d0 commit 430a67f
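The three merge conditions above amount to a simple eligibility test. Below is a minimal, standalone sketch of that test; the mock_queue struct, its fields, and the burst_interval parameter are illustrative stand-ins, not the actual BFQ data structures (the real logic is bfq_do_or_sched_stable_merge() in the diff further down).

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-in for a bfq_queue; not the kernel structure. */
struct mock_queue {
        unsigned long creation_time;    /* tick count when the queue was created */
        int ioprio;
        int ioprio_class;
        const void *group;              /* group (cgroup entity) the queue belongs to */
};

/*
 * Q2 (the queue currently being created) may be merged stably with Q1
 * (the last queue created before it) only if the three conditions from
 * the commit message all hold.
 */
static bool may_stably_merge(const struct mock_queue *q2,
                             const struct mock_queue *q1,
                             unsigned long burst_interval)
{
        if (!q1)
                return false;
        /* very little time has elapsed since Q1 was created */
        if (q2->creation_time - q1->creation_time > burst_interval)
                return false;
        /* Q2 has the same ioprio (and ioprio class) as Q1 */
        if (q2->ioprio != q1->ioprio || q2->ioprio_class != q1->ioprio_class)
                return false;
        /* Q2 belongs to the same group as Q1 */
        return q2->group == q1->group;
}

int main(void)
{
        static const int root_group = 0;        /* dummy group identity */
        const struct mock_queue q1 = { .creation_time = 1000, .ioprio = 4,
                                       .ioprio_class = 2, .group = &root_group };
        const struct mock_queue q2 = { .creation_time = 1010, .ioprio = 4,
                                       .ioprio_class = 2, .group = &root_group };

        /* Queues created 10 ticks apart, same ioprio, same group. */
        printf("stable merge: %s\n",
               may_stably_merge(&q2, &q1, 180) ? "yes" : "no");
        return 0;
}

In the actual patch, the time window is bfqd->bfq_burst_interval and the group check compares bfqq->entity.parent pointers, as the diff below shows.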

File tree: 3 files changed, +266 -10 lines

block/bfq-cgroup.c

Lines changed: 2 additions & 0 deletions
@@ -547,6 +547,8 @@ static void bfq_pd_init(struct blkg_policy_data *pd)
 
         entity->orig_weight = entity->weight = entity->new_weight = d->weight;
         entity->my_sched_data = &bfqg->sched_data;
+        entity->last_bfqq_created = NULL;
+
         bfqg->my_entity = entity; /*
                                    * the root_group's will be set to NULL
                                    * in bfq_init_queue()

block/bfq-iosched.c

Lines changed: 249 additions & 10 deletions
@@ -1075,7 +1075,7 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
 static int bfqq_process_refs(struct bfq_queue *bfqq)
 {
         return bfqq->ref - bfqq->allocated - bfqq->entity.on_st_or_in_serv -
-                (bfqq->weight_counter != NULL);
+                (bfqq->weight_counter != NULL) - bfqq->stable_ref;
 }
 
 /* Empty burst list and add just bfqq (see comments on bfq_handle_burst) */
@@ -2628,6 +2628,11 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,
         return true;
 }
 
+static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd,
+                                             struct bfq_queue *bfqq);
+
+static void bfq_put_stable_ref(struct bfq_queue *bfqq);
+
 /*
  * Attempt to schedule a merge of bfqq with the currently in-service
  * queue or with a close queue among the scheduled queues. Return
@@ -2650,10 +2655,49 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,
  */
 static struct bfq_queue *
 bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
-                     void *io_struct, bool request)
+                     void *io_struct, bool request, struct bfq_io_cq *bic)
 {
         struct bfq_queue *in_service_bfqq, *new_bfqq;
 
+        /*
+         * Check delayed stable merge for rotational or non-queueing
+         * devs. For this branch to be executed, bfqq must not be
+         * currently merged with some other queue (i.e., bfqq->bic
+         * must be non null). If we considered also merged queues,
+         * then we should also check whether bfqq has already been
+         * merged with bic->stable_merge_bfqq. But this would be
+         * costly and complicated.
+         */
+        if (unlikely(!bfqd->nonrot_with_queueing)) {
+                if (bic->stable_merge_bfqq &&
+                    !bfq_bfqq_just_created(bfqq) &&
+                    time_is_after_jiffies(bfqq->split_time +
+                                          msecs_to_jiffies(200))) {
+                        struct bfq_queue *stable_merge_bfqq =
+                                bic->stable_merge_bfqq;
+                        int proc_ref = min(bfqq_process_refs(bfqq),
+                                           bfqq_process_refs(stable_merge_bfqq));
+
+                        /* deschedule stable merge, because done or aborted here */
+                        bfq_put_stable_ref(stable_merge_bfqq);
+
+                        bic->stable_merge_bfqq = NULL;
+
+                        if (!idling_boosts_thr_without_issues(bfqd, bfqq) &&
+                            proc_ref > 0) {
+                                /* next function will take at least one ref */
+                                struct bfq_queue *new_bfqq =
+                                        bfq_setup_merge(bfqq, stable_merge_bfqq);
+
+                                bic->stably_merged = true;
+                                if (new_bfqq && new_bfqq->bic)
+                                        new_bfqq->bic->stably_merged = true;
+                                return new_bfqq;
+                        } else
+                                return NULL;
+                }
+        }
+
         /*
          * Do not perform queue merging if the device is non
          * rotational and performs internal queueing. In fact, such a
@@ -2795,6 +2839,17 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
         }
 }
 
+
+static void
+bfq_reassign_last_bfqq(struct bfq_queue *cur_bfqq, struct bfq_queue *new_bfqq)
+{
+        if (cur_bfqq->entity.parent &&
+            cur_bfqq->entity.parent->last_bfqq_created == cur_bfqq)
+                cur_bfqq->entity.parent->last_bfqq_created = new_bfqq;
+        else if (cur_bfqq->bfqd && cur_bfqq->bfqd->last_bfqq_created == cur_bfqq)
+                cur_bfqq->bfqd->last_bfqq_created = new_bfqq;
+}
+
 void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 {
         /*
@@ -2812,6 +2867,8 @@ void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq)
             bfqq != bfqd->in_service_queue)
                 bfq_del_bfqq_busy(bfqd, bfqq, false);
 
+        bfq_reassign_last_bfqq(bfqq, NULL);
+
         bfq_put_queue(bfqq);
 }
 
@@ -2908,6 +2965,9 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
          */
         new_bfqq->pid = -1;
         bfqq->bic = NULL;
+
+        bfq_reassign_last_bfqq(bfqq, new_bfqq);
+
         bfq_release_process_ref(bfqd, bfqq);
 }
 
@@ -2935,7 +2995,7 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
          * We take advantage of this function to perform an early merge
          * of the queues of possible cooperating processes.
          */
-        new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false);
+        new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false, bfqd->bio_bic);
         if (new_bfqq) {
                 /*
                  * bic still points to bfqq, then it has not yet been
@@ -5034,6 +5094,12 @@ void bfq_put_queue(struct bfq_queue *bfqq)
                 bfqg_and_blkg_put(bfqg);
 }
 
+static void bfq_put_stable_ref(struct bfq_queue *bfqq)
+{
+        bfqq->stable_ref--;
+        bfq_put_queue(bfqq);
+}
+
 static void bfq_put_cooperator(struct bfq_queue *bfqq)
 {
         struct bfq_queue *__bfqq, *next;
@@ -5090,6 +5156,24 @@ static void bfq_exit_icq(struct io_cq *icq)
 {
         struct bfq_io_cq *bic = icq_to_bic(icq);
 
+        if (bic->stable_merge_bfqq) {
+                struct bfq_data *bfqd = bic->stable_merge_bfqq->bfqd;
+
+                /*
+                 * bfqd is NULL if scheduler already exited, and in
+                 * that case this is the last time bfqq is accessed.
+                 */
+                if (bfqd) {
+                        unsigned long flags;
+
+                        spin_lock_irqsave(&bfqd->lock, flags);
+                        bfq_put_stable_ref(bic->stable_merge_bfqq);
+                        spin_unlock_irqrestore(&bfqd->lock, flags);
+                } else {
+                        bfq_put_stable_ref(bic->stable_merge_bfqq);
+                }
+        }
+
         bfq_exit_icq_bfqq(bic, true);
         bfq_exit_icq_bfqq(bic, false);
 }
@@ -5150,7 +5234,8 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
 
 static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
                                        struct bio *bio, bool is_sync,
-                                       struct bfq_io_cq *bic);
+                                       struct bfq_io_cq *bic,
+                                       bool respawn);
 
 static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
 {
@@ -5170,7 +5255,7 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
         bfqq = bic_to_bfqq(bic, false);
         if (bfqq) {
                 bfq_release_process_ref(bfqd, bfqq);
-                bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic);
+                bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic, true);
                 bic_set_bfqq(bic, bfqq, false);
         }
 
@@ -5213,6 +5298,8 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
         /* set end request to minus infinity from now */
         bfqq->ttime.last_end_request = now_ns + 1;
 
+        bfqq->creation_time = jiffies;
+
         bfqq->io_start_time = now_ns;
 
         bfq_mark_bfqq_IO_bound(bfqq);
@@ -5262,9 +5349,156 @@ static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
         }
 }
 
+static struct bfq_queue *
+bfq_do_early_stable_merge(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+                          struct bfq_io_cq *bic,
+                          struct bfq_queue *last_bfqq_created)
+{
+        struct bfq_queue *new_bfqq =
+                bfq_setup_merge(bfqq, last_bfqq_created);
+
+        if (!new_bfqq)
+                return bfqq;
+
+        if (new_bfqq->bic)
+                new_bfqq->bic->stably_merged = true;
+        bic->stably_merged = true;
+
+        /*
+         * Reusing merge functions. This implies that
+         * bfqq->bic must be set too, for
+         * bfq_merge_bfqqs to correctly save bfqq's
+         * state before killing it.
+         */
+        bfqq->bic = bic;
+        bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);
+
+        return new_bfqq;
+}
+
+/*
+ * Many throughput-sensitive workloads are made of several parallel
+ * I/O flows, with all flows generated by the same application, or
+ * more generically by the same task (e.g., system boot). The most
+ * counterproductive action with these workloads is plugging I/O
+ * dispatch when one of the bfq_queues associated with these flows
+ * remains temporarily empty.
+ *
+ * To avoid this plugging, BFQ has been using a burst-handling
+ * mechanism for years now. This mechanism has proven effective for
+ * throughput, and not detrimental for service guarantees. The
+ * following function pushes this mechanism a little bit further,
+ * based on the following two facts.
+ *
+ * First, all the I/O flows of the same application or task
+ * contribute to the execution/completion of that common application
+ * or task. So the performance figures that matter are total
+ * throughput of the flows and task-wide I/O latency. In particular,
+ * these flows do not need to be protected from each other, in terms
+ * of individual bandwidth or latency.
+ *
+ * Second, the above fact holds regardless of the number of flows.
+ *
+ * Putting these two facts together, this commit merges stably the
+ * bfq_queues associated with these I/O flows, i.e., with the
+ * processes that generate these I/O flows, regardless of how many
+ * processes are involved.
+ *
+ * To decide whether a set of bfq_queues is actually associated with
+ * the I/O flows of a common application or task, and to merge these
+ * queues stably, this function operates as follows: given a bfq_queue,
+ * say Q2, currently being created, and the last bfq_queue, say Q1,
+ * created before Q2, Q2 is merged stably with Q1 if
+ * - very little time has elapsed since Q1 was created
+ * - Q2 has the same ioprio as Q1
+ * - Q2 belongs to the same group as Q1
+ *
+ * Merging bfq_queues also reduces scheduling overhead. A fio test
+ * with ten random readers on /dev/nullb shows a throughput boost of
+ * 40%, with a quadcore. Since BFQ's execution time amounts to ~50% of
+ * the total per-request processing time, the above throughput boost
+ * implies that BFQ's overhead is reduced by more than 50%.
+ *
+ * This new mechanism most certainly obsoletes the current
+ * burst-handling heuristics. We keep those heuristics for the moment.
+ */
+static struct bfq_queue *bfq_do_or_sched_stable_merge(struct bfq_data *bfqd,
+                                                      struct bfq_queue *bfqq,
+                                                      struct bfq_io_cq *bic)
+{
+        struct bfq_queue **source_bfqq = bfqq->entity.parent ?
+                &bfqq->entity.parent->last_bfqq_created :
+                &bfqd->last_bfqq_created;
+
+        struct bfq_queue *last_bfqq_created = *source_bfqq;
+
+        /*
+         * If last_bfqq_created has not been set yet, then init it. If
+         * it has been set already, but too long ago, then move it
+         * forward to bfqq. Finally, move also if bfqq belongs to a
+         * different group than last_bfqq_created, or if bfqq has a
+         * different ioprio or ioprio_class. If none of these
+         * conditions holds true, then try an early stable merge or
+         * schedule a delayed stable merge.
+         *
+         * A delayed merge is scheduled (instead of performing an
+         * early merge), in case bfqq might soon prove to be more
+         * throughput-beneficial if not merged. Currently this is
+         * possible only if bfqd is rotational with no queueing. For
+         * such a drive, not merging bfqq is better for throughput if
+         * bfqq happens to contain sequential I/O. So, we wait a
+         * little bit for enough I/O to flow through bfqq. After that,
+         * if such an I/O is sequential, then the merge is
+         * canceled. Otherwise the merge is finally performed.
+         */
+        if (!last_bfqq_created ||
+            time_before(last_bfqq_created->creation_time +
+                        bfqd->bfq_burst_interval,
+                        bfqq->creation_time) ||
+            bfqq->entity.parent != last_bfqq_created->entity.parent ||
+            bfqq->ioprio != last_bfqq_created->ioprio ||
+            bfqq->ioprio_class != last_bfqq_created->ioprio_class)
+                *source_bfqq = bfqq;
+        else if (time_after_eq(last_bfqq_created->creation_time +
+                               bfqd->bfq_burst_interval,
+                               bfqq->creation_time)) {
+                if (likely(bfqd->nonrot_with_queueing))
+                        /*
+                         * With this type of drive, leaving
+                         * bfqq alone may provide no
+                         * throughput benefits compared with
+                         * merging bfqq. So merge bfqq now.
+                         */
+                        bfqq = bfq_do_early_stable_merge(bfqd, bfqq,
+                                                         bic,
+                                                         last_bfqq_created);
+                else { /* schedule tentative stable merge */
+                        /*
+                         * get reference on last_bfqq_created,
+                         * to prevent it from being freed,
+                         * until we decide whether to merge
+                         */
+                        last_bfqq_created->ref++;
+                        /*
+                         * need to keep track of stable refs, to
+                         * compute process refs correctly
+                         */
+                        last_bfqq_created->stable_ref++;
+                        /*
+                         * Record the bfqq to merge to.
+                         */
+                        bic->stable_merge_bfqq = last_bfqq_created;
+                }
+        }
+
+        return bfqq;
+}
+
+
 static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
                                        struct bio *bio, bool is_sync,
-                                       struct bfq_io_cq *bic)
+                                       struct bfq_io_cq *bic,
+                                       bool respawn)
 {
         const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
         const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
@@ -5322,7 +5556,10 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
 
 out:
         bfqq->ref++; /* get a process reference to this queue */
-        bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref);
+
+        if (bfqq != &bfqd->oom_bfqq && is_sync && !respawn)
+                bfqq = bfq_do_or_sched_stable_merge(bfqd, bfqq, bic);
+
         rcu_read_unlock();
         return bfqq;
 }
@@ -5572,7 +5809,8 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
 {
         struct bfq_queue *bfqq = RQ_BFQQ(rq),
-                *new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);
+                *new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true,
+                                                 RQ_BIC(rq));
         bool waiting, idle_timer_disabled = false;
 
         if (new_bfqq) {
@@ -6227,7 +6465,7 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
 
         if (bfqq)
                 bfq_put_queue(bfqq);
-        bfqq = bfq_get_queue(bfqd, bio, is_sync, bic);
+        bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, split);
 
         bic_set_bfqq(bic, bfqq, is_sync);
         if (split && is_sync) {
@@ -6348,7 +6586,8 @@ static struct bfq_queue *bfq_init_rq(struct request *rq)
 
         if (likely(!new_queue)) {
                 /* If the queue was seeky for too long, break it apart. */
-                if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
+                if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq) &&
+                    !bic->stably_merged) {
                         struct bfq_queue *old_bfqq = bfqq;
 
                         /* Update bic before losing reference to bfqq */
