Skip to content

Commit f57bf3f

Browse files
tzanussi authored and herbertx committed
crypto: iaa - Add per-cpu workqueue table with rebalancing
crypto: iaa - Add per-cpu workqueue table with rebalancing

The iaa compression/decompression algorithms in later patches need a way to retrieve an appropriate IAA workqueue depending on how close the associated IAA device is to the current cpu.

For this purpose, add a per-cpu array of workqueues such that an appropriate workqueue can be retrieved by simply accessing the per-cpu array.

Whenever a new workqueue is bound to or unbound from the iaa_crypto driver, the available workqueues are 'rebalanced' such that work submitted from a particular CPU is given to the most appropriate workqueue available. There currently isn't any way for the user to tweak the way this is done internally - if necessary, knobs can be added later for that purpose.

Current best practice is to configure and bind at least one workqueue for each IAA device, but as long as there is at least one workqueue configured and bound to any IAA device in the system, the iaa_crypto driver will work, albeit most likely not as efficiently.

[ Based on work originally by George Powley, Jing Lin and Kyung Min Park ]

Signed-off-by: Tom Zanussi <[email protected]>
Signed-off-by: Herbert Xu <[email protected]>
1 parent ea7a5cb commit f57bf3f

File tree

2 files changed

+229
-0
lines changed

2 files changed

+229
-0
lines changed

drivers/crypto/intel/iaa/iaa_crypto.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,4 +27,11 @@ struct iaa_device {
2727
struct list_head wqs;
2828
};
2929

30+
struct wq_table_entry {
31+
struct idxd_wq **wqs;
32+
int max_wqs;
33+
int n_wqs;
34+
int cur_wq;
35+
};
36+
3037
#endif

drivers/crypto/intel/iaa/iaa_crypto_main.c

Lines changed: 222 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,46 @@
2222

2323
/* number of iaa instances probed */
2424
static unsigned int nr_iaa;
25+
static unsigned int nr_cpus;
26+
static unsigned int nr_nodes;
27+
static unsigned int nr_cpus_per_node;
28+
29+
/* Number of physical cpus sharing each iaa instance */
30+
static unsigned int cpus_per_iaa;
31+
32+
/* Per-cpu lookup table for balanced wqs */
33+
static struct wq_table_entry __percpu *wq_table;
34+
35+
static void wq_table_add(int cpu, struct idxd_wq *wq)
36+
{
37+
struct wq_table_entry *entry = per_cpu_ptr(wq_table, cpu);
38+
39+
if (WARN_ON(entry->n_wqs == entry->max_wqs))
40+
return;
41+
42+
entry->wqs[entry->n_wqs++] = wq;
43+
44+
pr_debug("%s: added iaa wq %d.%d to idx %d of cpu %d\n", __func__,
45+
entry->wqs[entry->n_wqs - 1]->idxd->id,
46+
entry->wqs[entry->n_wqs - 1]->id, entry->n_wqs - 1, cpu);
47+
}
48+
49+
static void wq_table_free_entry(int cpu)
50+
{
51+
struct wq_table_entry *entry = per_cpu_ptr(wq_table, cpu);
52+
53+
kfree(entry->wqs);
54+
memset(entry, 0, sizeof(*entry));
55+
}
56+
57+
static void wq_table_clear_entry(int cpu)
58+
{
59+
struct wq_table_entry *entry = per_cpu_ptr(wq_table, cpu);
60+
61+
entry->n_wqs = 0;
62+
entry->cur_wq = 0;
63+
memset(entry->wqs, 0, entry->max_wqs * sizeof(struct idxd_wq *));
64+
}
2565

2666
static LIST_HEAD(iaa_devices);
2767
static DEFINE_MUTEX(iaa_devices_lock);
@@ -141,6 +181,53 @@ static void del_iaa_wq(struct iaa_device *iaa_device, struct idxd_wq *wq)
141181
}
142182
}
143183

184+
static void clear_wq_table(void)
185+
{
186+
int cpu;
187+
188+
for (cpu = 0; cpu < nr_cpus; cpu++)
189+
wq_table_clear_entry(cpu);
190+
191+
pr_debug("cleared wq table\n");
192+
}
193+
194+
static void free_wq_table(void)
195+
{
196+
int cpu;
197+
198+
for (cpu = 0; cpu < nr_cpus; cpu++)
199+
wq_table_free_entry(cpu);
200+
201+
free_percpu(wq_table);
202+
203+
pr_debug("freed wq table\n");
204+
}
205+
206+
static int alloc_wq_table(int max_wqs)
207+
{
208+
struct wq_table_entry *entry;
209+
int cpu;
210+
211+
wq_table = alloc_percpu(struct wq_table_entry);
212+
if (!wq_table)
213+
return -ENOMEM;
214+
215+
for (cpu = 0; cpu < nr_cpus; cpu++) {
216+
entry = per_cpu_ptr(wq_table, cpu);
217+
entry->wqs = kcalloc(max_wqs, sizeof(struct wq *), GFP_KERNEL);
218+
if (!entry->wqs) {
219+
free_wq_table();
220+
return -ENOMEM;
221+
}
222+
223+
entry->max_wqs = max_wqs;
224+
}
225+
226+
pr_debug("initialized wq table\n");
227+
228+
return 0;
229+
}
230+
144231
static int save_iaa_wq(struct idxd_wq *wq)
145232
{
146233
struct iaa_device *iaa_device, *found = NULL;
@@ -193,6 +280,8 @@ static int save_iaa_wq(struct idxd_wq *wq)
193280

194281
if (WARN_ON(nr_iaa == 0))
195282
return -EINVAL;
283+
284+
cpus_per_iaa = (nr_nodes * nr_cpus_per_node) / nr_iaa;
196285
out:
197286
return 0;
198287
}
@@ -207,6 +296,116 @@ static void remove_iaa_wq(struct idxd_wq *wq)
207296
break;
208297
}
209298
}
299+
300+
if (nr_iaa)
301+
cpus_per_iaa = (nr_nodes * nr_cpus_per_node) / nr_iaa;
302+
else
303+
cpus_per_iaa = 0;
304+
}
305+
306+
static int wq_table_add_wqs(int iaa, int cpu)
307+
{
308+
struct iaa_device *iaa_device, *found_device = NULL;
309+
int ret = 0, cur_iaa = 0, n_wqs_added = 0;
310+
struct idxd_device *idxd;
311+
struct iaa_wq *iaa_wq;
312+
struct pci_dev *pdev;
313+
struct device *dev;
314+
315+
list_for_each_entry(iaa_device, &iaa_devices, list) {
316+
idxd = iaa_device->idxd;
317+
pdev = idxd->pdev;
318+
dev = &pdev->dev;
319+
320+
if (cur_iaa != iaa) {
321+
cur_iaa++;
322+
continue;
323+
}
324+
325+
found_device = iaa_device;
326+
dev_dbg(dev, "getting wq from iaa_device %d, cur_iaa %d\n",
327+
found_device->idxd->id, cur_iaa);
328+
break;
329+
}
330+
331+
if (!found_device) {
332+
found_device = list_first_entry_or_null(&iaa_devices,
333+
struct iaa_device, list);
334+
if (!found_device) {
335+
pr_debug("couldn't find any iaa devices with wqs!\n");
336+
ret = -EINVAL;
337+
goto out;
338+
}
339+
cur_iaa = 0;
340+
341+
idxd = found_device->idxd;
342+
pdev = idxd->pdev;
343+
dev = &pdev->dev;
344+
dev_dbg(dev, "getting wq from only iaa_device %d, cur_iaa %d\n",
345+
found_device->idxd->id, cur_iaa);
346+
}
347+
348+
list_for_each_entry(iaa_wq, &found_device->wqs, list) {
349+
wq_table_add(cpu, iaa_wq->wq);
350+
pr_debug("rebalance: added wq for cpu=%d: iaa wq %d.%d\n",
351+
cpu, iaa_wq->wq->idxd->id, iaa_wq->wq->id);
352+
n_wqs_added++;
353+
};
354+
355+
if (!n_wqs_added) {
356+
pr_debug("couldn't find any iaa wqs!\n");
357+
ret = -EINVAL;
358+
goto out;
359+
}
360+
out:
361+
return ret;
362+
}
363+
364+
/*
365+
* Rebalance the wq table so that given a cpu, it's easy to find the
366+
* closest IAA instance. The idea is to try to choose the most
367+
* appropriate IAA instance for a caller and spread available
368+
* workqueues around to clients.
369+
*/
370+
static void rebalance_wq_table(void)
371+
{
372+
const struct cpumask *node_cpus;
373+
int node, cpu, iaa = -1;
374+
375+
if (nr_iaa == 0)
376+
return;
377+
378+
pr_debug("rebalance: nr_nodes=%d, nr_cpus %d, nr_iaa %d, cpus_per_iaa %d\n",
379+
nr_nodes, nr_cpus, nr_iaa, cpus_per_iaa);
380+
381+
clear_wq_table();
382+
383+
if (nr_iaa == 1) {
384+
for (cpu = 0; cpu < nr_cpus; cpu++) {
385+
if (WARN_ON(wq_table_add_wqs(0, cpu))) {
386+
pr_debug("could not add any wqs for iaa 0 to cpu %d!\n", cpu);
387+
return;
388+
}
389+
}
390+
391+
return;
392+
}
393+
394+
for_each_online_node(node) {
395+
node_cpus = cpumask_of_node(node);
396+
397+
for (cpu = 0; cpu < nr_cpus_per_node; cpu++) {
398+
int node_cpu = cpumask_nth(cpu, node_cpus);
399+
400+
if ((cpu % cpus_per_iaa) == 0)
401+
iaa++;
402+
403+
if (WARN_ON(wq_table_add_wqs(iaa, node_cpu))) {
404+
pr_debug("could not add any wqs for iaa %d to cpu %d!\n", iaa, cpu);
405+
return;
406+
}
407+
}
408+
}
210409
}
211410

212411
static int iaa_crypto_probe(struct idxd_dev *idxd_dev)
@@ -215,6 +414,7 @@ static int iaa_crypto_probe(struct idxd_dev *idxd_dev)
215414
struct idxd_device *idxd = wq->idxd;
216415
struct idxd_driver_data *data = idxd->data;
217416
struct device *dev = &idxd_dev->conf_dev;
417+
bool first_wq = false;
218418
int ret = 0;
219419

220420
if (idxd->state != IDXD_DEV_ENABLED)
@@ -245,17 +445,30 @@ static int iaa_crypto_probe(struct idxd_dev *idxd_dev)
245445

246446
mutex_lock(&iaa_devices_lock);
247447

448+
if (list_empty(&iaa_devices)) {
449+
ret = alloc_wq_table(wq->idxd->max_wqs);
450+
if (ret)
451+
goto err_alloc;
452+
first_wq = true;
453+
}
454+
248455
ret = save_iaa_wq(wq);
249456
if (ret)
250457
goto err_save;
251458

459+
rebalance_wq_table();
460+
252461
mutex_unlock(&iaa_devices_lock);
253462
out:
254463
mutex_unlock(&wq->wq_lock);
255464

256465
return ret;
257466

258467
err_save:
468+
if (first_wq)
469+
free_wq_table();
470+
err_alloc:
471+
mutex_unlock(&iaa_devices_lock);
259472
idxd_drv_disable_wq(wq);
260473
err:
261474
wq->type = IDXD_WQT_NONE;
@@ -273,7 +486,12 @@ static void iaa_crypto_remove(struct idxd_dev *idxd_dev)
273486
mutex_lock(&iaa_devices_lock);
274487

275488
remove_iaa_wq(wq);
489+
276490
idxd_drv_disable_wq(wq);
491+
rebalance_wq_table();
492+
493+
if (nr_iaa == 0)
494+
free_wq_table();
277495

278496
mutex_unlock(&iaa_devices_lock);
279497
mutex_unlock(&wq->wq_lock);
@@ -295,6 +513,10 @@ static int __init iaa_crypto_init_module(void)
295513
{
296514
int ret = 0;
297515

516+
nr_cpus = num_online_cpus();
517+
nr_nodes = num_online_nodes();
518+
nr_cpus_per_node = nr_cpus / nr_nodes;
519+
298520
ret = idxd_driver_register(&iaa_crypto_driver);
299521
if (ret) {
300522
pr_debug("IAA wq sub-driver registration failed\n");

0 commit comments

Comments
 (0)