 #include <linux/atomic.h>
 #include <linux/ctype.h>
 #include <linux/blk-cgroup.h>
+#include <linux/tracehook.h>
 #include "blk.h"
 
 #define MAX_KEY_LEN 100
@@ -999,6 +1000,14 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
                 if (!blkcg_debug_stats)
                         goto next;
 
+                if (atomic_read(&blkg->use_delay)) {
+                        has_stats = true;
+                        off += scnprintf(buf + off, size - off,
+                                         " use_delay=%d delay_nsec=%llu",
+                                         atomic_read(&blkg->use_delay),
+                                         (unsigned long long)atomic64_read(&blkg->delay_nsec));
+                }
+
                 for (i = 0; i < BLKCG_MAX_POLS; i++) {
                         struct blkcg_policy *pol = blkcg_policy[i];
                         size_t written;
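
The new use_delay/delay_nsec pair above is only emitted into a cgroup's io.stat line when the blkcg_debug_stats module parameter is enabled and the blkg currently has delay charged to it. As a rough illustration of what a consumer would see, here is a minimal userspace sketch that picks the two fields back out; the cgroup name and the /sys/fs/cgroup mount point are assumptions for illustration, not anything this patch defines.

/* Hypothetical reader for the new io.stat debug fields. Assumes cgroup2 is
 * mounted at /sys/fs/cgroup and blkcg_debug_stats=1; neither is guaranteed
 * by this patch, and "mygroup" is a made-up cgroup name. */
#include <stdio.h>
#include <string.h>

int main(void)
{
        char line[512];
        FILE *f = fopen("/sys/fs/cgroup/mygroup/io.stat", "r");

        if (!f) {
                perror("io.stat");
                return 1;
        }
        while (fgets(line, sizeof(line), f)) {
                /* Each line is "MAJ:MIN key=val ...". The new fields only
                 * appear while the blkg has delay charged to it. */
                char *p = strstr(line, "use_delay=");
                int use_delay;
                unsigned long long delay_nsec;

                if (p && sscanf(p, "use_delay=%d delay_nsec=%llu",
                                &use_delay, &delay_nsec) == 2)
                        printf("use_delay=%d delay_nsec=%llu\n",
                               use_delay, delay_nsec);
        }
        fclose(f);
        return 0;
}
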
@@ -1326,6 +1335,13 @@ static void blkcg_bind(struct cgroup_subsys_state *root_css)
         mutex_unlock(&blkcg_pol_mutex);
 }
 
+static void blkcg_exit(struct task_struct *tsk)
+{
+        if (tsk->throttle_queue)
+                blk_put_queue(tsk->throttle_queue);
+        tsk->throttle_queue = NULL;
+}
+
 struct cgroup_subsys io_cgrp_subsys = {
         .css_alloc = blkcg_css_alloc,
         .css_offline = blkcg_css_offline,
@@ -1335,6 +1351,7 @@ struct cgroup_subsys io_cgrp_subsys = {
         .dfl_cftypes = blkcg_files,
         .legacy_cftypes = blkcg_legacy_files,
         .legacy_name = "blkio",
+        .exit = blkcg_exit,
 #ifdef CONFIG_MEMCG
         /*
          * This ensures that, if available, memcg is automatically enabled
@@ -1586,5 +1603,208 @@ void blkcg_policy_unregister(struct blkcg_policy *pol)
 }
 EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
 
+/*
+ * Scale the accumulated delay based on how long it has been since we updated
+ * the delay. We only call this when we are adding delay, in case it's been a
+ * while since we added delay, and when we are checking to see if we need to
+ * delay a task, to account for any delays that may have occurred.
+ */
+static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
+{
+        u64 old = atomic64_read(&blkg->delay_start);
+
+        /*
+         * We only want to scale down every second. The idea here is that we
+         * want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain
+         * time window. We only want to throttle tasks for recent delay that
+         * has occurred, in 1 second time windows since that's the maximum
+         * things can be throttled. We save the current delay window in
+         * blkg->last_delay so we know what amount is still left to be charged
+         * to the blkg from this point onward. blkg->last_use keeps track of
+         * the use_delay counter. The idea is if we're unthrottling the blkg we
+         * are ok with whatever is happening now, and we can take away more of
+         * the accumulated delay as we've already throttled enough that
+         * everybody is happy with their IO latencies.
+         */
+        if (time_before64(old + NSEC_PER_SEC, now) &&
+            atomic64_cmpxchg(&blkg->delay_start, old, now) == old) {
+                u64 cur = atomic64_read(&blkg->delay_nsec);
+                u64 sub = min_t(u64, blkg->last_delay, now - old);
+                int cur_use = atomic_read(&blkg->use_delay);
+
+                /*
+                 * We've been unthrottled, subtract a larger chunk of our
+                 * accumulated delay.
+                 */
+                if (cur_use < blkg->last_use)
+                        sub = max_t(u64, sub, blkg->last_delay >> 1);
+
+                /*
+                 * This shouldn't happen, but handle it anyway. Our delay_nsec
+                 * should only ever be growing except here where we subtract out
+                 * min(last_delay, 1 second), but lord knows bugs happen and I'd
+                 * rather not end up with negative numbers.
+                 */
+                if (unlikely(cur < sub)) {
+                        atomic64_set(&blkg->delay_nsec, 0);
+                        blkg->last_delay = 0;
+                } else {
+                        atomic64_sub(sub, &blkg->delay_nsec);
+                        blkg->last_delay = cur - sub;
+                }
+                blkg->last_use = cur_use;
+        }
+}
+
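To make the decay in blkcg_scale_delay() concrete, the following is a small userspace model of just the subtraction step, with plain integers standing in for the atomics. The struct, function, and scenario are invented for illustration; it sketches the arithmetic described in the comment above rather than reproducing kernel code.

/* Userspace model of the blkcg_scale_delay() subtraction step. The struct,
 * names, and scenario are illustrative only; the kernel uses atomics and
 * per-blkg fields instead. */
#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

struct model {
        uint64_t delay_nsec;    /* accumulated delay */
        uint64_t last_delay;    /* delay still chargeable from this window */
        int use_delay;          /* current throttle count */
        int last_use;           /* throttle count at the previous scale */
};

static void scale(struct model *m, uint64_t elapsed_nsec)
{
        /* Charge at most one window's worth: min(last_delay, elapsed). */
        uint64_t sub = m->last_delay < elapsed_nsec ? m->last_delay : elapsed_nsec;

        /* If the blkg was unthrottled since last time, decay faster. */
        if (m->use_delay < m->last_use && sub < m->last_delay / 2)
                sub = m->last_delay / 2;

        if (m->delay_nsec < sub) {
                m->delay_nsec = 0;
                m->last_delay = 0;
        } else {
                m->delay_nsec -= sub;
                m->last_delay = m->delay_nsec;
        }
        m->last_use = m->use_delay;
}

int main(void)
{
        /* 3s of accumulated delay, one 1s window elapsed, still throttled. */
        struct model m = { 3 * NSEC_PER_SEC, 3 * NSEC_PER_SEC, 2, 2 };

        scale(&m, NSEC_PER_SEC);
        /* Prints delay_nsec=2000000000 last_delay=2000000000 */
        printf("delay_nsec=%llu last_delay=%llu\n",
               (unsigned long long)m.delay_nsec,
               (unsigned long long)m.last_delay);
        return 0;
}
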
+/*
+ * This is called when we want to actually walk up the hierarchy and check to
+ * see if we need to throttle, and then actually throttle if there is some
+ * accumulated delay. This should only be called upon return to user space so
+ * we're not holding some lock that would induce a priority inversion.
+ */
+static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
+{
+        u64 now = ktime_to_ns(ktime_get());
+        u64 exp;
+        u64 delay_nsec = 0;
+        int tok;
+
+        while (blkg->parent) {
+                if (atomic_read(&blkg->use_delay)) {
+                        blkcg_scale_delay(blkg, now);
+                        delay_nsec = max_t(u64, delay_nsec,
+                                           atomic64_read(&blkg->delay_nsec));
+                }
+                blkg = blkg->parent;
+        }
+
+        if (!delay_nsec)
+                return;
+
+        /*
+         * Let's not sleep for all eternity if we've amassed a huge delay.
+         * Swapping or metadata IO can accumulate 10's of seconds worth of
+         * delay, and we want userspace to be able to do _something_ so cap the
+         * delays at 0.25s. If there's 10's of seconds worth of delay then
+         * the tasks will be delayed for 0.25 seconds for every syscall.
+         */
+        delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);
+
+        /*
+         * TODO: the use_memdelay flag is going to be for the upcoming psi stuff
+         * that hasn't landed upstream yet. Once that stuff is in place we need
+         * to do a psi_memstall_enter/leave if memdelay is set.
+         */
+
+        exp = ktime_add_ns(now, delay_nsec);
+        tok = io_schedule_prepare();
+        do {
+                __set_current_state(TASK_KILLABLE);
+                if (!schedule_hrtimeout(&exp, HRTIMER_MODE_ABS))
+                        break;
+        } while (!fatal_signal_pending(current));
+        io_schedule_finish(tok);
+}
+
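Worth noting about blkcg_maybe_throttle_blkg(): the sleep length is the largest accumulated delay found anywhere on the blkg's ancestor chain, clamped to 0.25s per return to user space. A tiny userspace model of just that selection, with simplified structs and invented names, might look like this.

/* Userspace model of the delay selection in blkcg_maybe_throttle_blkg():
 * take the largest accumulated delay along the ancestor chain, then clamp.
 * The struct and names are invented for illustration. */
#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_MSEC 1000000ULL

struct fake_blkg {
        uint64_t delay_nsec;
        int use_delay;
        struct fake_blkg *parent;
};

static uint64_t pick_delay(struct fake_blkg *blkg)
{
        uint64_t delay_nsec = 0;

        /* The root blkg (parent == NULL) never contributes. */
        while (blkg->parent) {
                if (blkg->use_delay && blkg->delay_nsec > delay_nsec)
                        delay_nsec = blkg->delay_nsec;
                blkg = blkg->parent;
        }

        /* Cap the per-return-to-userspace sleep at 0.25s. */
        if (delay_nsec > 250 * NSEC_PER_MSEC)
                delay_nsec = 250 * NSEC_PER_MSEC;
        return delay_nsec;
}

int main(void)
{
        struct fake_blkg root = { 0, 0, NULL };
        struct fake_blkg mid = { 40 * NSEC_PER_MSEC, 1, &root };
        struct fake_blkg leaf = { 10 * NSEC_PER_MSEC, 1, &mid };

        /* Prints 40000000: the parent's larger delay wins, under the cap. */
        printf("%llu\n", (unsigned long long)pick_delay(&leaf));
        return 0;
}
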
+/**
+ * blkcg_maybe_throttle_current - throttle the current task if it has been marked
+ *
+ * This is only called if we've been marked with set_notify_resume(). Obviously
+ * we can be set_notify_resume() for reasons other than blkcg throttling, so we
+ * check to see if current->throttle_queue is set and if not this doesn't do
+ * anything. This should only ever be called by the resume code, it's not meant
+ * to be called by people willy-nilly as it will actually do the work to
+ * throttle the task if it is set up for throttling.
+ */
+void blkcg_maybe_throttle_current(void)
+{
+        struct request_queue *q = current->throttle_queue;
+        struct cgroup_subsys_state *css;
+        struct blkcg *blkcg;
+        struct blkcg_gq *blkg;
+        bool use_memdelay = current->use_memdelay;
+
+        if (!q)
+                return;
+
+        current->throttle_queue = NULL;
+        current->use_memdelay = false;
+
+        rcu_read_lock();
+        css = kthread_blkcg();
+        if (css)
+                blkcg = css_to_blkcg(css);
+        else
+                blkcg = css_to_blkcg(task_css(current, io_cgrp_id));
+
+        if (!blkcg)
+                goto out;
+        blkg = blkg_lookup(blkcg, q);
+        if (!blkg)
+                goto out;
+        blkg = blkg_try_get(blkg);
+        if (!blkg)
+                goto out;
+        rcu_read_unlock();
+        blk_put_queue(q);
+
+        blkcg_maybe_throttle_blkg(blkg, use_memdelay);
+        blkg_put(blkg);
+        return;
+out:
+        rcu_read_unlock();
+        blk_put_queue(q);
+}
+EXPORT_SYMBOL_GPL(blkcg_maybe_throttle_current);
+
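The call site that actually triggers blkcg_maybe_throttle_current() on the way back to user space is outside this file and not shown here; the <linux/tracehook.h> include added at the top and the set_notify_resume() reference in the kernel-doc suggest it is wired into the notify-resume path. A rough, hypothetical sketch of that call pattern follows (the function name is invented); the point is that the sleep happens with no locks held, which is why throttling is deferred to this boundary rather than done where the delay was charged.

#include <linux/blk-cgroup.h>

/* Hypothetical resume-side hook, for illustration only; the real hunk that
 * wires this up is not part of this excerpt. */
static void example_notify_resume_work(void)
{
        /* ... handle the other TIF_NOTIFY_RESUME work first ... */

        /*
         * Safe to sleep here: we are on the way back to user space and hold
         * no locks, so an accumulated blkcg delay can be paid off without
         * risking a priority inversion.
         */
        blkcg_maybe_throttle_current();
}
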
+/**
+ * blkcg_schedule_throttle - this task needs to check for throttling
+ * @q - the request queue IO was submitted on
+ * @use_memdelay - do we charge this to memory delay for PSI
+ *
+ * This is called by the IO controller when we know there's delay accumulated
+ * for the blkg for this task. We do not pass the blkg because there are places
+ * we call this that may not have that information, the swapping code for
+ * instance will only have a request_queue at that point. This sets the
+ * notify_resume for the task to check and see if it requires throttling before
+ * returning to user space.
+ *
+ * We will only schedule once per syscall. You can call this over and over
+ * again and it will only do the check once upon return to user space, and only
+ * throttle once. If the task needs to be throttled again it'll need to be
+ * re-set the next time we see the task.
+ */
+void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay)
+{
+        if (unlikely(current->flags & PF_KTHREAD))
+                return;
+
+        if (!blk_get_queue(q))
+                return;
+
+        if (current->throttle_queue)
+                blk_put_queue(current->throttle_queue);
+        current->throttle_queue = q;
+        if (use_memdelay)
+                current->use_memdelay = use_memdelay;
+        set_notify_resume(current);
+}
+EXPORT_SYMBOL_GPL(blkcg_schedule_throttle);
+
+/**
+ * blkcg_add_delay - add delay to this blkg
+ * @now - the current time in nanoseconds
+ * @delta - how many nanoseconds of delay to add
+ *
+ * Charge @delta to the blkg's current delay accumulation. This is used to
+ * throttle tasks if an IO controller thinks we need more throttling.
+ */
+void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
+{
+        blkcg_scale_delay(blkg, now);
+        atomic64_add(delta, &blkg->delay_nsec);
+}
+EXPORT_SYMBOL_GPL(blkcg_add_delay);
+
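Taken together, a policy that decides a group needs throttling would charge the blkg and then arm the submitting task so the sleep happens at its next return to user space. A hypothetical policy-side caller is sketched below; the function and parameter names are invented, and use_delay is bumped directly only to keep the sketch self-contained.

#include <linux/blk-cgroup.h>

/* Hypothetical policy-side caller, for illustration only. */
static void example_penalize(struct blkcg_gq *blkg, struct request_queue *q,
                             u64 now, u64 penalty_nsec)
{
        /*
         * Mark the blkg as wanting throttling at all; without a non-zero
         * use_delay the walk in blkcg_maybe_throttle_blkg() skips it.
         */
        atomic_inc(&blkg->use_delay);

        /* Accumulate the delay; blkcg_scale_delay() decays it over time. */
        blkcg_add_delay(blkg, now, penalty_nsec);

        /* Arm the submitting task; the sleep happens on return to user space. */
        blkcg_schedule_throttle(q, false);
}
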
 module_param(blkcg_debug_stats, bool, 0644);
 MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");