Commit d55262c

workqueue: update sysfs interface to reflect NUMA awareness and a kernel param to disable NUMA affinity
Unbound workqueues are now NUMA aware.  Let's add some control knobs and update the sysfs interface accordingly.

* Add kernel param workqueue.disable_numa which disables NUMA affinity globally.

* Replace sysfs file "pool_id" with "pool_ids" which contains node:pool_id pairs.  This change is userland-visible, but "pool_id" hasn't seen a release yet, so this is okay.

* Add a new sysfs file "numa" which can toggle NUMA affinity on individual workqueues.  This is implemented as attrs->no_numa, which is special in that it isn't part of a pool's attributes.  It only affects how apply_workqueue_attrs() picks which pools to use.

After the "pool_ids" change, first_pwq() doesn't have any user left.  Removed.

Signed-off-by: Tejun Heo <[email protected]>
Reviewed-by: Lai Jiangshan <[email protected]>
1 parent 4c16bd3 commit d55262c

File tree (3 files changed: +73, -23 lines)


Documentation/kernel-parameters.txt

Lines changed: 9 additions & 0 deletions
@@ -3222,6 +3222,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			or other driver-specific files in the
 			Documentation/watchdog/ directory.
 
+	workqueue.disable_numa
+			By default, all work items queued to unbound
+			workqueues are affine to the NUMA nodes they're
+			issued on, which results in better behavior in
+			general.  If NUMA affinity needs to be disabled for
+			whatever reason, this option can be used.  Note
+			that this also can be controlled per-workqueue for
+			workqueues visible under /sys/bus/workqueue/.
+
 	x2apic_phys	[X86-64,APIC] Use x2apic physical mode instead of
 			default x2apic cluster mode on platforms
 			supporting x2apic.
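The boot option pairs with a read-only module parameter (declared with module_param_named(..., 0444) in kernel/workqueue.c below), so the effective setting should be visible at runtime. A minimal userspace sketch, assuming the usual /sys/module/<name>/parameters/ layout for built-in parameters:

/*
 * Hypothetical check (not part of this commit): read back the
 * workqueue.disable_numa setting exported by module_param_named() below.
 */
#include <stdio.h>

int main(void)
{
	/* Assumed path for the built-in parameter; adjust if it differs. */
	FILE *f = fopen("/sys/module/workqueue/parameters/disable_numa", "r");
	char val[8];

	if (f && fgets(val, sizeof(val), f))
		printf("workqueue.disable_numa = %s", val);
	if (f)
		fclose(f);
	return 0;
}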

include/linux/workqueue.h

Lines changed: 5 additions & 0 deletions
@@ -119,10 +119,15 @@ struct delayed_work {
 /*
  * A struct for workqueue attributes. This can be used to change
  * attributes of an unbound workqueue.
+ *
+ * Unlike other fields, ->no_numa isn't a property of a worker_pool. It
+ * only modifies how apply_workqueue_attrs() select pools and thus doesn't
+ * participate in pool hash calculations or equality comparisons.
  */
 struct workqueue_attrs {
 	int			nice;		/* nice level */
 	cpumask_var_t		cpumask;	/* allowed CPUs */
+	bool			no_numa;	/* disable NUMA affinity */
 };
 
 static inline struct delayed_work *to_delayed_work(struct work_struct *work)
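Since ->no_numa only steers apply_workqueue_attrs(), a workqueue owner can flip it without changing the pool attributes proper. A hedged in-kernel sketch (the workqueue name and surrounding setup are illustrative, not part of this commit):

/*
 * Hypothetical in-kernel example: create an unbound workqueue and opt it
 * out of NUMA affinity via attrs->no_numa.
 */
#include <linux/workqueue.h>
#include <linux/gfp.h>

static struct workqueue_struct *example_wq;

static int example_wq_setup(void)
{
	struct workqueue_attrs *attrs;
	int ret;

	example_wq = alloc_workqueue("example_wq", WQ_UNBOUND | WQ_SYSFS, 0);
	if (!example_wq)
		return -ENOMEM;

	attrs = alloc_workqueue_attrs(GFP_KERNEL);
	if (!attrs) {
		destroy_workqueue(example_wq);
		return -ENOMEM;
	}

	/*
	 * ->no_numa isn't hashed or compared as part of the pool attributes;
	 * it only changes which pwqs apply_workqueue_attrs() installs per node.
	 */
	attrs->no_numa = true;
	ret = apply_workqueue_attrs(example_wq, attrs);
	free_workqueue_attrs(attrs);
	return ret;
}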

kernel/workqueue.c

Lines changed: 59 additions & 23 deletions
@@ -268,6 +268,9 @@ static int wq_numa_tbl_len; /* highest possible NUMA node id + 1 */
 static cpumask_var_t *wq_numa_possible_cpumask;
 					/* possible CPUs of each node */
 
+static bool wq_disable_numa;
+module_param_named(disable_numa, wq_disable_numa, bool, 0444);
+
 static bool wq_numa_enabled;		/* unbound NUMA affinity enabled */
 
 /* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */
@@ -516,21 +519,6 @@ static int worker_pool_assign_id(struct worker_pool *pool)
 	return ret;
 }
 
-/**
- * first_pwq - return the first pool_workqueue of the specified workqueue
- * @wq: the target workqueue
- *
- * This must be called either with wq->mutex held or sched RCU read locked.
- * If the pwq needs to be used beyond the locking in effect, the caller is
- * responsible for guaranteeing that the pwq stays online.
- */
-static struct pool_workqueue *first_pwq(struct workqueue_struct *wq)
-{
-	assert_rcu_or_wq_mutex(wq);
-	return list_first_or_null_rcu(&wq->pwqs, struct pool_workqueue,
-				      pwqs_node);
-}
-
 /**
  * unbound_pwq_by_node - return the unbound pool_workqueue for the given node
  * @wq: the target workqueue
@@ -3114,16 +3102,21 @@ static struct device_attribute wq_sysfs_attrs[] = {
 	__ATTR_NULL,
 };
 
-static ssize_t wq_pool_id_show(struct device *dev,
-			       struct device_attribute *attr, char *buf)
+static ssize_t wq_pool_ids_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
 {
 	struct workqueue_struct *wq = dev_to_wq(dev);
-	struct worker_pool *pool;
-	int written;
+	const char *delim = "";
+	int node, written = 0;
 
 	rcu_read_lock_sched();
-	pool = first_pwq(wq)->pool;
-	written = scnprintf(buf, PAGE_SIZE, "%d\n", pool->id);
+	for_each_node(node) {
+		written += scnprintf(buf + written, PAGE_SIZE - written,
+				     "%s%d:%d", delim, node,
+				     unbound_pwq_by_node(wq, node)->pool->id);
+		delim = " ";
+	}
+	written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
 	rcu_read_unlock_sched();
 
 	return written;
@@ -3212,10 +3205,46 @@ static ssize_t wq_cpumask_store(struct device *dev,
 	return ret ?: count;
 }
 
+static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr,
+			    char *buf)
+{
+	struct workqueue_struct *wq = dev_to_wq(dev);
+	int written;
+
+	mutex_lock(&wq->mutex);
+	written = scnprintf(buf, PAGE_SIZE, "%d\n",
+			    !wq->unbound_attrs->no_numa);
+	mutex_unlock(&wq->mutex);
+
+	return written;
+}
+
+static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
+			     const char *buf, size_t count)
+{
+	struct workqueue_struct *wq = dev_to_wq(dev);
+	struct workqueue_attrs *attrs;
+	int v, ret;
+
+	attrs = wq_sysfs_prep_attrs(wq);
+	if (!attrs)
+		return -ENOMEM;
+
+	ret = -EINVAL;
+	if (sscanf(buf, "%d", &v) == 1) {
+		attrs->no_numa = !v;
+		ret = apply_workqueue_attrs(wq, attrs);
+	}
+
+	free_workqueue_attrs(attrs);
+	return ret ?: count;
+}
+
 static struct device_attribute wq_sysfs_unbound_attrs[] = {
-	__ATTR(pool_id, 0444, wq_pool_id_show, NULL),
+	__ATTR(pool_ids, 0444, wq_pool_ids_show, NULL),
 	__ATTR(nice, 0644, wq_nice_show, wq_nice_store),
 	__ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
+	__ATTR(numa, 0644, wq_numa_show, wq_numa_store),
 	__ATTR_NULL,
 };
 
@@ -3750,7 +3779,7 @@ static void free_unbound_pwq(struct pool_workqueue *pwq)
 static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node,
 				 int cpu_going_down, cpumask_t *cpumask)
 {
-	if (!wq_numa_enabled)
+	if (!wq_numa_enabled || attrs->no_numa)
 		goto use_dfl;
 
 	/* does @node have any online CPUs @attrs wants? */
@@ -3951,6 +3980,8 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
 	cpumask = target_attrs->cpumask;
 
 	mutex_lock(&wq->mutex);
+	if (wq->unbound_attrs->no_numa)
+		goto out_unlock;
 
 	copy_workqueue_attrs(target_attrs, wq->unbound_attrs);
 	pwq = unbound_pwq_by_node(wq, node);
@@ -4763,6 +4794,11 @@ static void __init wq_numa_init(void)
 	if (num_possible_nodes() <= 1)
 		return;
 
+	if (wq_disable_numa) {
+		pr_info("workqueue: NUMA affinity support disabled\n");
+		return;
+	}
+
 	wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(GFP_KERNEL);
 	BUG_ON(!wq_update_unbound_numa_attrs_buf);
 
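With these attributes in place, a workqueue registered via WQ_SYSFS (or workqueue_sysfs_register()) exposes pool_ids, nice, cpumask, and numa under /sys/bus/workqueue/. A hedged userspace sketch (the device name "example_wq" and exact path layout are assumptions, not part of this commit):

/* Hypothetical userspace example: inspect pool_ids and turn NUMA affinity off. */
#include <stdio.h>

int main(void)
{
	/* Path assumes a WQ_SYSFS workqueue named "example_wq". */
	const char *dir = "/sys/bus/workqueue/devices/example_wq";
	char path[256], line[256];
	FILE *f;

	/* pool_ids now prints one "node:pool_id" pair per NUMA node, e.g. "0:4 1:6". */
	snprintf(path, sizeof(path), "%s/pool_ids", dir);
	f = fopen(path, "r");
	if (f && fgets(line, sizeof(line), f))
		printf("pool_ids: %s", line);
	if (f)
		fclose(f);

	/* Writing 0 to "numa" sets attrs->no_numa (attrs->no_numa = !v) and reapplies the attrs. */
	snprintf(path, sizeof(path), "%s/numa", dir);
	f = fopen(path, "w");
	if (f) {
		fputs("0\n", f);
		fclose(f);
	}
	return 0;
}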
