Skip to content

Commit f1de243

Browse files
htejun authored and axboe committed
blk-iocost: revamp donation amount determination
iocost has various safety nets to combat inuse adjustment calculation inaccuracies. With Andy's method implemented in transfer_surpluses(), inuse adjustment calculations are now accurate and we can make donation amount determinations accurate too.

* Stop keeping track of past usage history and using the maximum. Act on the immediate usage information.

* Remove donation constraints defined by SURPLUS_* constants. Donate whatever isn't used.

* Determine the donation amount so that the iocg will end up with MARGIN_TARGET_PCT budget at the end of the coming period assuming the same usage as the previous period. TARGET is set at 50% of period, which is the previous maximum. This provides smooth convergence for most repetitive IO patterns.

* Apply donation logic early at 20% budget. There's no risk in doing so as the calculation is based on the delta between the current budget and the target budget at the end of the coming period.

* Remove preemptive iocg activation for zero cost IOs. As donation can reach near zero now, the mere activation doesn't provide any protection anymore. In the unlikely case that this becomes a problem, the right solution is assigning appropriate costs for such IOs.

This significantly improves the donation determination logic while also simplifying it. Now all donations are immediate, exact and smooth.

Signed-off-by: Tejun Heo <[email protected]>
Cc: Andy Newell <[email protected]>
Signed-off-by: Jens Axboe <[email protected]>
1 parent e08d02a commit f1de243

File tree

1 file changed

+51
-82
lines changed

1 file changed

+51
-82
lines changed

block/blk-iocost.c

Lines changed: 51 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -217,12 +217,14 @@ enum {
217217
MAX_PERIOD = USEC_PER_SEC,
218218

219219
/*
220-
* A cgroup's vtime can run 50% behind the device vtime, which
220+
* iocg->vtime is targeted at 50% behind the device vtime, which
221221
* serves as its IO credit buffer. Surplus weight adjustment is
222222
* immediately canceled if the vtime margin runs below 10%.
223223
*/
224224
MARGIN_MIN_PCT = 10,
225-
MARGIN_MAX_PCT = 50,
225+
MARGIN_LOW_PCT = 20,
226+
MARGIN_TARGET_PCT = 50,
227+
MARGIN_MAX_PCT = 100,
226228

227229
/* Have some play in timer operations */
228230
TIMER_SLACK_PCT = 1,
@@ -234,17 +236,6 @@ enum {
234236
*/
235237
VTIME_VALID_DUR = 300 * USEC_PER_SEC,
236238

237-
/*
238-
* Remember the past three non-zero usages and use the max for
239-
* surplus calculation. Three slots guarantee that we remember one
240-
* full period usage from the last active stretch even after
241-
* partial deactivation and re-activation periods. Don't start
242-
* giving away weight before collecting two data points to prevent
243-
* hweight adjustments based on one partial activation period.
244-
*/
245-
NR_USAGE_SLOTS = 3,
246-
MIN_VALID_USAGES = 2,
247-
248239
/* 1/64k is granular enough and can easily be handled w/ u32 */
249240
WEIGHT_ONE = 1 << 16,
250241

@@ -280,14 +271,6 @@ enum {
280271
/* don't let cmds which take a very long time pin lagging for too long */
281272
MAX_LAGGING_PERIODS = 10,
282273

283-
/*
284-
* If usage% * 1.25 + 2% is lower than hweight% by more than 3%,
285-
* donate the surplus.
286-
*/
287-
SURPLUS_SCALE_PCT = 125, /* * 125% */
288-
SURPLUS_SCALE_ABS = WEIGHT_ONE / 50, /* + 2% */
289-
SURPLUS_MIN_ADJ_DELTA = WEIGHT_ONE / 33, /* 3% */
290-
291274
/* switch iff the conditions are met for longer than this */
292275
AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC,
293276

@@ -376,6 +359,8 @@ struct ioc_params {
376359

377360
struct ioc_margins {
378361
s64 min;
362+
s64 low;
363+
s64 target;
379364
s64 max;
380365
};
381366

@@ -514,11 +499,7 @@ struct ioc_gq {
514499
struct iocg_stat desc_stat;
515500
struct iocg_stat last_stat;
516501
u64 last_stat_abs_vusage;
517-
518-
/* usage is recorded as fractions of WEIGHT_ONE */
519-
u32 usage_delta_us;
520-
int usage_idx;
521-
u32 usages[NR_USAGE_SLOTS];
502+
u64 usage_delta_us;
522503

523504
/* this iocg's depth in the hierarchy and ancestors including self */
524505
int level;
@@ -737,6 +718,8 @@ static void ioc_refresh_margins(struct ioc *ioc)
737718
u64 vrate = atomic64_read(&ioc->vtime_rate);
738719

739720
margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate;
721+
margins->low = (period_us * MARGIN_LOW_PCT / 100) * vrate;
722+
margins->target = (period_us * MARGIN_TARGET_PCT / 100) * vrate;
740723
margins->max = (period_us * MARGIN_MAX_PCT / 100) * vrate;
741724
}
742725

@@ -1228,7 +1211,7 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now)
12281211
return false;
12291212
}
12301213
if (!atomic_read(&blkg->use_delay) &&
1231-
time_before_eq64(vtime, now->vnow + ioc->margins.max))
1214+
time_before_eq64(vtime, now->vnow + ioc->margins.target))
12321215
return false;
12331216

12341217
/* use delay */
@@ -1527,7 +1510,7 @@ static u32 hweight_after_donation(struct ioc_gq *iocg, u32 hwm, u32 usage,
15271510
{
15281511
struct ioc *ioc = iocg->ioc;
15291512
u64 vtime = atomic64_read(&iocg->vtime);
1530-
s64 excess;
1513+
s64 excess, delta, target, new_hwi;
15311514

15321515
/* see whether minimum margin requirement is met */
15331516
if (waitqueue_active(&iocg->waitq) ||
@@ -1542,15 +1525,28 @@ static u32 hweight_after_donation(struct ioc_gq *iocg, u32 hwm, u32 usage,
15421525
vtime += excess;
15431526
}
15441527

1545-
/* add margin */
1546-
usage = DIV_ROUND_UP(usage * SURPLUS_SCALE_PCT, 100);
1547-
usage += SURPLUS_SCALE_ABS;
1548-
1549-
/* don't bother if the surplus is too small */
1550-
if (usage + SURPLUS_MIN_ADJ_DELTA > hwm)
1551-
return hwm;
1528+
/*
1529+
* Let's say the distance between iocg's and device's vtimes as a
1530+
* fraction of period duration is delta. Assuming that the iocg will
1531+
* consume the usage determined above, we want to determine new_hwi so
1532+
* that delta equals MARGIN_TARGET at the end of the next period.
1533+
*
1534+
* We need to execute usage worth of IOs while spending the sum of the
1535+
* new budget (1 - MARGIN_TARGET) and the leftover from the last period
1536+
* (delta):
1537+
*
1538+
* usage = (1 - MARGIN_TARGET + delta) * new_hwi
1539+
*
1540+
* Therefore, the new_hwi is:
1541+
*
1542+
* new_hwi = usage / (1 - MARGIN_TARGET + delta)
1543+
*/
1544+
delta = div64_s64(WEIGHT_ONE * (now->vnow - vtime),
1545+
now->vnow - ioc->period_at_vtime);
1546+
target = WEIGHT_ONE * MARGIN_TARGET_PCT / 100;
1547+
new_hwi = div64_s64(WEIGHT_ONE * usage, WEIGHT_ONE - target + delta);
15521548

1553-
return usage;
1549+
return clamp_t(s64, new_hwi, 1, hwm);
15541550
}
15551551

15561552
/*
@@ -1812,7 +1808,7 @@ static void ioc_timer_fn(struct timer_list *timer)
18121808
u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
18131809
u32 missed_ppm[2], rq_wait_pct;
18141810
u64 period_vtime;
1815-
int prev_busy_level, i;
1811+
int prev_busy_level;
18161812

18171813
/* how were the latencies during the period? */
18181814
ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
@@ -1857,11 +1853,10 @@ static void ioc_timer_fn(struct timer_list *timer)
18571853
}
18581854
commit_weights(ioc);
18591855

1860-
/* calc usages and see whether some weights need to be moved around */
1856+
/* calc usage and see whether some weights need to be moved around */
18611857
list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
1862-
u64 vdone, vtime, usage_us;
1863-
u32 hw_active, hw_inuse, usage;
1864-
int uidx, nr_valid;
1858+
u64 vdone, vtime, usage_us, usage_dur;
1859+
u32 usage, hw_active, hw_inuse;
18651860

18661861
/*
18671862
* Collect unused and wind vtime closer to vnow to prevent
@@ -1886,59 +1881,34 @@ static void ioc_timer_fn(struct timer_list *timer)
18861881
nr_lagging++;
18871882

18881883
/*
1889-
* Determine absolute usage factoring in pending and in-flight
1890-
* IOs to avoid stalls and high-latency completions appearing as
1891-
* idle.
1884+
* Determine absolute usage factoring in in-flight IOs to avoid
1885+
* high-latency completions appearing as idle.
18921886
*/
18931887
usage_us = iocg->usage_delta_us;
1894-
if (waitqueue_active(&iocg->waitq) && time_before64(vtime, now.vnow))
1895-
usage_us += DIV64_U64_ROUND_UP(
1896-
cost_to_abs_cost(now.vnow - vtime, hw_inuse),
1897-
now.vrate);
1888+
18981889
if (vdone != vtime) {
18991890
u64 inflight_us = DIV64_U64_ROUND_UP(
19001891
cost_to_abs_cost(vtime - vdone, hw_inuse),
19011892
now.vrate);
19021893
usage_us = max(usage_us, inflight_us);
19031894
}
19041895

1905-
/* convert to hweight based usage ratio and record */
1906-
uidx = (iocg->usage_idx + 1) % NR_USAGE_SLOTS;
1907-
1908-
if (time_after64(vtime, now.vnow - ioc->margins.min)) {
1909-
iocg->usage_idx = uidx;
1910-
iocg->usages[uidx] = WEIGHT_ONE;
1911-
} else if (usage_us) {
1912-
u64 started_at, dur;
1913-
1914-
if (time_after64(iocg->activated_at, ioc->period_at))
1915-
started_at = iocg->activated_at;
1916-
else
1917-
started_at = ioc->period_at;
1918-
1919-
dur = max_t(u64, now.now - started_at, 1);
1896+
/* convert to hweight based usage ratio */
1897+
if (time_after64(iocg->activated_at, ioc->period_at))
1898+
usage_dur = max_t(u64, now.now - iocg->activated_at, 1);
1899+
else
1900+
usage_dur = max_t(u64, now.now - ioc->period_at, 1);
19201901

1921-
iocg->usage_idx = uidx;
1922-
iocg->usages[uidx] = clamp_t(u32,
1923-
DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE, dur),
1902+
usage = clamp_t(u32,
1903+
DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE,
1904+
usage_dur),
19241905
1, WEIGHT_ONE);
1925-
}
1926-
1927-
/* base the decision on max historical usage */
1928-
for (i = 0, usage = 0, nr_valid = 0; i < NR_USAGE_SLOTS; i++) {
1929-
if (iocg->usages[i]) {
1930-
usage = max(usage, iocg->usages[i]);
1931-
nr_valid++;
1932-
}
1933-
}
1934-
if (nr_valid < MIN_VALID_USAGES)
1935-
usage = WEIGHT_ONE;
19361906

19371907
/* see whether there's surplus vtime */
19381908
WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
19391909
if (hw_inuse < hw_active ||
19401910
(!waitqueue_active(&iocg->waitq) &&
1941-
time_before64(vtime, now.vnow - ioc->margins.max))) {
1911+
time_before64(vtime, now.vnow - ioc->margins.low))) {
19421912
u32 hwa, hwm, new_hwi;
19431913

19441914
/*
@@ -2175,15 +2145,14 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
21752145
if (!ioc->enabled || !iocg->level)
21762146
return;
21772147

2178-
/* always activate so that even 0 cost IOs get protected to some level */
2179-
if (!iocg_activate(iocg, &now))
2180-
return;
2181-
21822148
/* calculate the absolute vtime cost */
21832149
abs_cost = calc_vtime_cost(bio, iocg, false);
21842150
if (!abs_cost)
21852151
return;
21862152

2153+
if (!iocg_activate(iocg, &now))
2154+
return;
2155+
21872156
iocg->cursor = bio_end_sector(bio);
21882157

21892158
vtime = atomic64_read(&iocg->vtime);

0 commit comments

Comments
 (0)