Skip to content

Commit e9edc18

Browse files
edumazetummakynes
authored andcommitted
netfilter: conntrack: serialize hash resizes and cleanups
Syzbot was able to trigger the following warning [1] No repro found by syzbot yet but I was able to trigger similar issue by having 2 scripts running in parallel, changing conntrack hash sizes, and: for j in `seq 1 1000` ; do unshare -n /bin/true >/dev/null ; done It would take more than 5 minutes for net_namespace structures to be cleaned up. This is because nf_ct_iterate_cleanup() has to restart everytime a resize happened. By adding a mutex, we can serialize hash resizes and cleanups and also make get_next_corpse() faster by skipping over empty buckets. Even without resizes in the picture, this patch considerably speeds up network namespace dismantles. [1] INFO: task syz-executor.0:8312 can't die for more than 144 seconds. task:syz-executor.0 state:R running task stack:25672 pid: 8312 ppid: 6573 flags:0x00004006 Call Trace: context_switch kernel/sched/core.c:4955 [inline] __schedule+0x940/0x26f0 kernel/sched/core.c:6236 preempt_schedule_common+0x45/0xc0 kernel/sched/core.c:6408 preempt_schedule_thunk+0x16/0x18 arch/x86/entry/thunk_64.S:35 __local_bh_enable_ip+0x109/0x120 kernel/softirq.c:390 local_bh_enable include/linux/bottom_half.h:32 [inline] get_next_corpse net/netfilter/nf_conntrack_core.c:2252 [inline] nf_ct_iterate_cleanup+0x15a/0x450 net/netfilter/nf_conntrack_core.c:2275 nf_conntrack_cleanup_net_list+0x14c/0x4f0 net/netfilter/nf_conntrack_core.c:2469 ops_exit_list+0x10d/0x160 net/core/net_namespace.c:171 setup_net+0x639/0xa30 net/core/net_namespace.c:349 copy_net_ns+0x319/0x760 net/core/net_namespace.c:470 create_new_namespaces+0x3f6/0xb20 kernel/nsproxy.c:110 unshare_nsproxy_namespaces+0xc1/0x1f0 kernel/nsproxy.c:226 ksys_unshare+0x445/0x920 kernel/fork.c:3128 __do_sys_unshare kernel/fork.c:3202 [inline] __se_sys_unshare kernel/fork.c:3200 [inline] __x64_sys_unshare+0x2d/0x40 kernel/fork.c:3200 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f63da68e739 RSP: 002b:00007f63d7c05188 EFLAGS: 00000246 ORIG_RAX: 0000000000000110 RAX: ffffffffffffffda RBX: 00007f63da792f80 RCX: 00007f63da68e739 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000040000000 RBP: 00007f63da6e8cc4 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00007f63da792f80 R13: 00007fff50b75d3f R14: 00007f63d7c05300 R15: 0000000000022000 Showing all locks held in the system: 1 lock held by khungtaskd/27: #0: ffffffff8b980020 (rcu_read_lock){....}-{1:2}, at: debug_show_all_locks+0x53/0x260 kernel/locking/lockdep.c:6446 2 locks held by kworker/u4:2/153: #0: ffff888010c69138 ((wq_completion)events_unbound){+.+.}-{0:0}, at: arch_atomic64_set arch/x86/include/asm/atomic64_64.h:34 [inline] #0: ffff888010c69138 ((wq_completion)events_unbound){+.+.}-{0:0}, at: arch_atomic_long_set include/linux/atomic/atomic-long.h:41 [inline] #0: ffff888010c69138 ((wq_completion)events_unbound){+.+.}-{0:0}, at: atomic_long_set include/linux/atomic/atomic-instrumented.h:1198 [inline] #0: ffff888010c69138 ((wq_completion)events_unbound){+.+.}-{0:0}, at: set_work_data kernel/workqueue.c:634 [inline] #0: ffff888010c69138 ((wq_completion)events_unbound){+.+.}-{0:0}, at: set_work_pool_and_clear_pending kernel/workqueue.c:661 [inline] #0: ffff888010c69138 ((wq_completion)events_unbound){+.+.}-{0:0}, at: process_one_work+0x896/0x1690 kernel/workqueue.c:2268 #1: ffffc9000140fdb0 ((kfence_timer).work){+.+.}-{0:0}, at: process_one_work+0x8ca/0x1690 kernel/workqueue.c:2272 1 lock held by systemd-udevd/2970: 1 lock held by in:imklog/6258: #0: ffff88807f970ff0 (&f->f_pos_lock){+.+.}-{3:3}, at: __fdget_pos+0xe9/0x100 fs/file.c:990 3 locks held by kworker/1:6/8158: 1 lock held by syz-executor.0/8312: 2 locks held by kworker/u4:13/9320: 1 lock held by syz-executor.5/10178: 1 lock held by syz-executor.4/10217: Signed-off-by: Eric Dumazet <[email protected]> Reported-by: syzbot <[email protected]> Signed-off-by: Pablo Neira Ayuso <[email protected]>
1 parent b53deef commit e9edc18

File tree

1 file changed

+37
-33
lines changed

1 file changed

+37
-33
lines changed

net/netfilter/nf_conntrack_core.c

Lines changed: 37 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,9 @@ static __read_mostly struct kmem_cache *nf_conntrack_cachep;
7474
static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
7575
static __read_mostly bool nf_conntrack_locks_all;
7676

77+
/* serialize hash resizes and nf_ct_iterate_cleanup */
78+
static DEFINE_MUTEX(nf_conntrack_mutex);
79+
7780
#define GC_SCAN_INTERVAL (120u * HZ)
7881
#define GC_SCAN_MAX_DURATION msecs_to_jiffies(10)
7982

@@ -2263,28 +2266,31 @@ get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
22632266
spinlock_t *lockp;
22642267

22652268
for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
2269+
struct hlist_nulls_head *hslot = &nf_conntrack_hash[*bucket];
2270+
2271+
if (hlist_nulls_empty(hslot))
2272+
continue;
2273+
22662274
lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
22672275
local_bh_disable();
22682276
nf_conntrack_lock(lockp);
2269-
if (*bucket < nf_conntrack_htable_size) {
2270-
hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnnode) {
2271-
if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY)
2272-
continue;
2273-
/* All nf_conn objects are added to hash table twice, one
2274-
* for original direction tuple, once for the reply tuple.
2275-
*
2276-
* Exception: In the IPS_NAT_CLASH case, only the reply
2277-
* tuple is added (the original tuple already existed for
2278-
* a different object).
2279-
*
2280-
* We only need to call the iterator once for each
2281-
* conntrack, so we just use the 'reply' direction
2282-
* tuple while iterating.
2283-
*/
2284-
ct = nf_ct_tuplehash_to_ctrack(h);
2285-
if (iter(ct, data))
2286-
goto found;
2287-
}
2277+
hlist_nulls_for_each_entry(h, n, hslot, hnnode) {
2278+
if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY)
2279+
continue;
2280+
/* All nf_conn objects are added to hash table twice, one
2281+
* for original direction tuple, once for the reply tuple.
2282+
*
2283+
* Exception: In the IPS_NAT_CLASH case, only the reply
2284+
* tuple is added (the original tuple already existed for
2285+
* a different object).
2286+
*
2287+
* We only need to call the iterator once for each
2288+
* conntrack, so we just use the 'reply' direction
2289+
* tuple while iterating.
2290+
*/
2291+
ct = nf_ct_tuplehash_to_ctrack(h);
2292+
if (iter(ct, data))
2293+
goto found;
22882294
}
22892295
spin_unlock(lockp);
22902296
local_bh_enable();
@@ -2302,26 +2308,20 @@ get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
23022308
static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data),
23032309
void *data, u32 portid, int report)
23042310
{
2305-
unsigned int bucket = 0, sequence;
2311+
unsigned int bucket = 0;
23062312
struct nf_conn *ct;
23072313

23082314
might_sleep();
23092315

2310-
for (;;) {
2311-
sequence = read_seqcount_begin(&nf_conntrack_generation);
2312-
2313-
while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) {
2314-
/* Time to push up daises... */
2316+
mutex_lock(&nf_conntrack_mutex);
2317+
while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) {
2318+
/* Time to push up daises... */
23152319

2316-
nf_ct_delete(ct, portid, report);
2317-
nf_ct_put(ct);
2318-
cond_resched();
2319-
}
2320-
2321-
if (!read_seqcount_retry(&nf_conntrack_generation, sequence))
2322-
break;
2323-
bucket = 0;
2320+
nf_ct_delete(ct, portid, report);
2321+
nf_ct_put(ct);
2322+
cond_resched();
23242323
}
2324+
mutex_unlock(&nf_conntrack_mutex);
23252325
}
23262326

23272327
struct iter_data {
@@ -2557,8 +2557,10 @@ int nf_conntrack_hash_resize(unsigned int hashsize)
25572557
if (!hash)
25582558
return -ENOMEM;
25592559

2560+
mutex_lock(&nf_conntrack_mutex);
25602561
old_size = nf_conntrack_htable_size;
25612562
if (old_size == hashsize) {
2563+
mutex_unlock(&nf_conntrack_mutex);
25622564
kvfree(hash);
25632565
return 0;
25642566
}
@@ -2598,6 +2600,8 @@ int nf_conntrack_hash_resize(unsigned int hashsize)
25982600
nf_conntrack_all_unlock();
25992601
local_bh_enable();
26002602

2603+
mutex_unlock(&nf_conntrack_mutex);
2604+
26012605
synchronize_net();
26022606
kvfree(old_hash);
26032607
return 0;

0 commit comments

Comments
 (0)