Skip to content

Commit 2195e40

Browse files
liu-song-6 kernel-patches-bot
authored and committed
bpf: use raw_spin_trylock() for pcpu_freelist_push/pop in NMI
Recent improvements in LOCKDEP highlighted a potential A-A deadlock with pcpu_freelist in NMI: ./tools/testing/selftests/bpf/test_progs -t stacktrace_build_id_nmi [ 18.984807] ================================ [ 18.984807] WARNING: inconsistent lock state [ 18.984808] 5.9.0-rc6-01771-g1466de1330e1 #2967 Not tainted [ 18.984809] -------------------------------- [ 18.984809] inconsistent {INITIAL USE} -> {IN-NMI} usage. [ 18.984810] test_progs/1990 [HC2[2]:SC0[0]:HE0:SE1] takes: [ 18.984810] ffffe8ffffc219c0 (&head->lock){....}-{2:2}, at: __pcpu_freelist_pop+0xe3/0x180 [ 18.984813] {INITIAL USE} state was registered at: [ 18.984814] lock_acquire+0x175/0x7c0 [ 18.984814] _raw_spin_lock+0x2c/0x40 [ 18.984815] __pcpu_freelist_pop+0xe3/0x180 [ 18.984815] pcpu_freelist_pop+0x31/0x40 [ 18.984816] htab_map_alloc+0xbbf/0xf40 [ 18.984816] __do_sys_bpf+0x5aa/0x3ed0 [ 18.984817] do_syscall_64+0x2d/0x40 [ 18.984818] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 18.984818] irq event stamp: 12 [ ... ] [ 18.984822] other info that might help us debug this: [ 18.984823] Possible unsafe locking scenario: [ 18.984823] [ 18.984824] CPU0 [ 18.984824] ---- [ 18.984824] lock(&head->lock); [ 18.984826] <Interrupt> [ 18.984826] lock(&head->lock); [ 18.984827] [ 18.984828] *** DEADLOCK *** [ 18.984828] [ 18.984829] 2 locks held by test_progs/1990: [ ... ] [ 18.984838] <NMI> [ 18.984838] dump_stack+0x9a/0xd0 [ 18.984839] lock_acquire+0x5c9/0x7c0 [ 18.984839] ? lock_release+0x6f0/0x6f0 [ 18.984840] ? __pcpu_freelist_pop+0xe3/0x180 [ 18.984840] _raw_spin_lock+0x2c/0x40 [ 18.984841] ? __pcpu_freelist_pop+0xe3/0x180 [ 18.984841] __pcpu_freelist_pop+0xe3/0x180 [ 18.984842] pcpu_freelist_pop+0x17/0x40 [ 18.984842] ? lock_release+0x6f0/0x6f0 [ 18.984843] __bpf_get_stackid+0x534/0xaf0 [ 18.984843] bpf_prog_1fd9e30e1438d3c5_oncpu+0x73/0x350 [ 18.984844] bpf_overflow_handler+0x12f/0x3f0 This is because pcpu_freelist_head.lock is accessed in both NMI and non-NMI context. 
Fix this issue by using raw_spin_trylock() in NMI. Since NMI interrupts non-NMI context, when NMI context tries to lock the raw_spinlock, non-NMI context of the same cpu may already have taken that lock and be blocked from releasing it. For a system with N cpus, there could be N NMIs at the same time, and they may block N non-NMI raw_spinlocks. This is tricky for pcpu_freelist_push(), where, unlike _pop(), failing _push() means leaking memory. This issue is more likely to trigger on non-SMP systems. Fix this issue with an extra list, pcpu_freelist.extralist. The extralist is primarily used to absorb _push() calls when raw_spin_trylock() fails on all the per cpu lists. It should be empty most of the time. The following table summarizes the behavior of pcpu_freelist in NMI and non-NMI:
non-NMI pop(): use _lock(); check per cpu lists first; if all per cpu lists are empty, check extralist; if extralist is empty, return NULL.
non-NMI push(): use _lock(); only push to per cpu lists.
NMI pop(): use _trylock(); check per cpu lists first; if all per cpu lists are locked or empty, check extralist; if extralist is locked or empty, return NULL.
NMI push(): use _trylock(); check per cpu lists first; if all per cpu lists are locked, try to push to extralist; if extralist is also locked, keep trying on per cpu lists.
Reported-by: Alexei Starovoitov <[email protected]> Signed-off-by: Song Liu <[email protected]>
1 parent b58a7a6 commit 2195e40

File tree

2 files changed

+97
-5
lines changed

2 files changed

+97
-5
lines changed

kernel/bpf/percpu_freelist.c

Lines changed: 96 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ int pcpu_freelist_init(struct pcpu_freelist *s)
1717
raw_spin_lock_init(&head->lock);
1818
head->first = NULL;
1919
}
20+
raw_spin_lock_init(&s->extralist.lock);
21+
s->extralist.first = NULL;
2022
return 0;
2123
}
2224

@@ -40,12 +42,50 @@ static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head,
4042
raw_spin_unlock(&head->lock);
4143
}
4244

45+
/*
 * Best-effort push of @node onto s->extralist.
 *
 * The extralist lock is only ever attempted with raw_spin_trylock(), so this
 * helper never spins: if the lock is already held it returns false and leaves
 * the list untouched. On success @node is handed to pcpu_freelist_push_node()
 * under the lock, the lock is released, and true is returned.
 */
static inline bool pcpu_freelist_try_push_extra(struct pcpu_freelist *s,
46+
struct pcpu_freelist_node *node)
47+
{
48+
if (!raw_spin_trylock(&s->extralist.lock))
49+
return false;
50+
51+
pcpu_freelist_push_node(&s->extralist, node);
52+
raw_spin_unlock(&s->extralist.lock);
53+
return true;
54+
}
55+
56+
/*
 * Push @node using only raw_spin_trylock(); this is the variant that
 * __pcpu_freelist_push() selects when in_nmi() is true.
 *
 * Starting from the CPU reported by raw_smp_processor_id(), walk the CPUs in
 * cpu_possible_mask (wrapping to 0 past nr_cpu_ids) and push onto the first
 * per-CPU head whose lock can be taken. Each time the walk comes back around
 * to the starting CPU without success, try the extralist once via
 * pcpu_freelist_try_push_extra(). If that also fails, the loop simply starts
 * another pass; the function only returns after @node has been pushed
 * somewhere, so it has no failure return.
 */
static inline void ___pcpu_freelist_push_nmi(struct pcpu_freelist *s,
57+
struct pcpu_freelist_node *node)
58+
{
59+
int cpu, orig_cpu;
60+
61+
orig_cpu = cpu = raw_smp_processor_id();
62+
while (1) {
63+
struct pcpu_freelist_head *head;
64+
65+
head = per_cpu_ptr(s->freelist, cpu);
66+
if (raw_spin_trylock(&head->lock)) {
67+
pcpu_freelist_push_node(head, node);
68+
raw_spin_unlock(&head->lock);
69+
return;
70+
}
71+
cpu = cpumask_next(cpu, cpu_possible_mask);
72+
if (cpu >= nr_cpu_ids)
73+
cpu = 0;
74+
75+
/* cannot lock any per cpu lock, try extralist */
76+
if (cpu == orig_cpu &&
77+
pcpu_freelist_try_push_extra(s, node))
78+
return;
79+
}
80+
}
81+
4382
/*
 * Push @node back onto the freelist @s.
 *
 * As rendered here this is a diff: lines whose gutter tag ends in '-' are the
 * removed pre-patch body (unconditional push onto this CPU's list), lines
 * tagged '+' are the replacement. After the patch the function dispatches on
 * in_nmi(): NMI context goes through the trylock-only
 * ___pcpu_freelist_push_nmi(), every other context pushes onto the
 * this_cpu_ptr() head via ___pcpu_freelist_push().
 */
void __pcpu_freelist_push(struct pcpu_freelist *s,
4483
struct pcpu_freelist_node *node)
4584
{
46-
struct pcpu_freelist_head *head = this_cpu_ptr(s->freelist);
47-
48-
___pcpu_freelist_push(head, node);
85+
if (in_nmi())
86+
___pcpu_freelist_push_nmi(s, node);
87+
else
88+
___pcpu_freelist_push(this_cpu_ptr(s->freelist), node);
4989
}
5090

5191
void pcpu_freelist_push(struct pcpu_freelist *s,
@@ -81,7 +121,7 @@ void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
81121
}
82122
}
83123

84-
struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s)
124+
static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s)
85125
{
86126
struct pcpu_freelist_head *head;
87127
struct pcpu_freelist_node *node;
@@ -102,8 +142,59 @@ struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s)
102142
if (cpu >= nr_cpu_ids)
103143
cpu = 0;
104144
if (cpu == orig_cpu)
105-
return NULL;
145+
break;
146+
}
147+
148+
/* per cpu lists are all empty, try extralist */
149+
raw_spin_lock(&s->extralist.lock);
150+
node = s->extralist.first;
151+
if (node)
152+
s->extralist.first = node->next;
153+
raw_spin_unlock(&s->extralist.lock);
154+
return node;
155+
}
156+
157+
/*
 * Pop one node using only raw_spin_trylock(); this is the variant that
 * __pcpu_freelist_pop() selects when in_nmi() is true.
 *
 * Makes a single pass over the CPUs in cpu_possible_mask, starting at the CPU
 * reported by raw_smp_processor_id() and wrapping to 0 past nr_cpu_ids. A
 * per-CPU head is skipped when its lock cannot be taken or its list is empty;
 * the first node found is unlinked and returned. If the pass ends with
 * nothing, the extralist is tried once, again with trylock.
 *
 * Returns the popped node, or NULL when every list was either locked or
 * empty. A NULL result therefore does not prove the freelist is empty.
 */
static struct pcpu_freelist_node *
158+
___pcpu_freelist_pop_nmi(struct pcpu_freelist *s)
159+
{
160+
struct pcpu_freelist_head *head;
161+
struct pcpu_freelist_node *node;
162+
int orig_cpu, cpu;
163+
164+
orig_cpu = cpu = raw_smp_processor_id();
165+
while (1) {
166+
head = per_cpu_ptr(s->freelist, cpu);
167+
if (raw_spin_trylock(&head->lock)) {
168+
node = head->first;
169+
if (node) {
170+
head->first = node->next;
171+
raw_spin_unlock(&head->lock);
172+
return node;
173+
}
174+
raw_spin_unlock(&head->lock);
175+
}
176+
cpu = cpumask_next(cpu, cpu_possible_mask);
177+
if (cpu >= nr_cpu_ids)
178+
cpu = 0;
179+
if (cpu == orig_cpu)
180+
break;
106181
}
182+
183+
/* cannot pop from per cpu lists, try extralist */
184+
if (!raw_spin_trylock(&s->extralist.lock))
185+
return NULL;
186+
node = s->extralist.first;
187+
if (node)
188+
s->extralist.first = node->next;
189+
raw_spin_unlock(&s->extralist.lock);
190+
return node;
191+
}
192+
193+
/*
 * Pop one node from the freelist @s, choosing the locking strategy by
 * context: in_nmi() selects the trylock-only ___pcpu_freelist_pop_nmi(),
 * anything else uses ___pcpu_freelist_pop(). Returns whatever the selected
 * helper returns (a node, or NULL when it found nothing to pop).
 */
struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s)
194+
{
195+
if (in_nmi())
196+
return ___pcpu_freelist_pop_nmi(s);
197+
return ___pcpu_freelist_pop(s);
107198
}
108199

109200
struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)

kernel/bpf/percpu_freelist.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ struct pcpu_freelist_head {
1313

1414
/*
 * A freelist made of one list head per CPU plus a single shared overflow
 * head.
 *
 * @freelist:  per-CPU list heads, reached with per_cpu_ptr()/this_cpu_ptr().
 * @extralist: one extra head with its own lock. The NMI push path falls back
 *             to it when no per-CPU lock could be taken, and both pop paths
 *             look here after the per-CPU lists yielded nothing.
 */
struct pcpu_freelist {
1515
struct pcpu_freelist_head __percpu *freelist;
16+
struct pcpu_freelist_head extralist;
1617
};
1718

1819
struct pcpu_freelist_node {

0 commit comments

Comments
 (0)