Skip to content

Commit d5a42de

Browse files
joannekoong authored and kuba-moo committed
net: Add a second bind table hashed by port and address
We currently have one tcp bind table (bhash) which hashes by port number only. In the socket bind path, we check for bind conflicts by traversing the specified port's inet_bind_bucket while holding the bucket's spinlock (see inet_csk_get_port() and inet_csk_bind_conflict()). In instances where there are tons of sockets hashed to the same port at different addresses, checking for a bind conflict is time-intensive and can cause softirq cpu lockups; it can also stall new tcp connections, since __inet_inherit_port() contends for the same spinlock. This patch proposes adding a second bind table, bhash2, that hashes by port and ip address. Searching the bhash2 table leads to significantly faster conflict resolution and less time holding the spinlock. Signed-off-by: Joanne Koong <[email protected]> Reviewed-by: Eric Dumazet <[email protected]> Acked-by: Kuniyuki Iwashima <[email protected]> Signed-off-by: Jakub Kicinski <[email protected]>
1 parent eac67d8 commit d5a42de

File tree

7 files changed

+489
-83
lines changed

7 files changed

+489
-83
lines changed

include/net/inet_connection_sock.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#undef INET_CSK_CLEAR_TIMERS
2626

2727
struct inet_bind_bucket;
28+
struct inet_bind2_bucket;
2829
struct tcp_congestion_ops;
2930

3031
/*
@@ -57,6 +58,7 @@ struct inet_connection_sock_af_ops {
5758
*
5859
* @icsk_accept_queue: FIFO of established children
5960
* @icsk_bind_hash: Bind node
61+
* @icsk_bind2_hash: Bind node in the bhash2 table
6062
* @icsk_timeout: Timeout
6163
* @icsk_retransmit_timer: Resend (no ack)
6264
* @icsk_rto: Retransmit timeout
@@ -83,6 +85,7 @@ struct inet_connection_sock {
8385
struct inet_sock icsk_inet;
8486
struct request_sock_queue icsk_accept_queue;
8587
struct inet_bind_bucket *icsk_bind_hash;
88+
struct inet_bind2_bucket *icsk_bind2_hash;
8689
unsigned long icsk_timeout;
8790
struct timer_list icsk_retransmit_timer;
8891
struct timer_list icsk_delack_timer;

include/net/inet_hashtables.h

Lines changed: 67 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,11 +90,32 @@ struct inet_bind_bucket {
9090
struct hlist_head owners;
9191
};
9292

93+
struct inet_bind2_bucket {
94+
possible_net_t ib_net;
95+
int l3mdev;
96+
unsigned short port;
97+
union {
98+
#if IS_ENABLED(CONFIG_IPV6)
99+
struct in6_addr v6_rcv_saddr;
100+
#endif
101+
__be32 rcv_saddr;
102+
};
103+
/* Node in the inet2_bind_hashbucket chain */
104+
struct hlist_node node;
105+
/* List of sockets hashed to this bucket */
106+
struct hlist_head owners;
107+
};
108+
93109
static inline struct net *ib_net(struct inet_bind_bucket *ib)
94110
{
95111
return read_pnet(&ib->ib_net);
96112
}
97113

114+
static inline struct net *ib2_net(struct inet_bind2_bucket *ib)
115+
{
116+
return read_pnet(&ib->ib_net);
117+
}
118+
98119
#define inet_bind_bucket_for_each(tb, head) \
99120
hlist_for_each_entry(tb, head, node)
100121

@@ -103,6 +124,15 @@ struct inet_bind_hashbucket {
103124
struct hlist_head chain;
104125
};
105126

127+
/* This is synchronized using the inet_bind_hashbucket's spinlock.
128+
* Instead of having separate spinlocks, the inet_bind2_hashbucket can share
129+
* the inet_bind_hashbucket's given that in every case where the bhash2 table
130+
* is useful, a lookup in the bhash table also occurs.
131+
*/
132+
struct inet_bind2_hashbucket {
133+
struct hlist_head chain;
134+
};
135+
106136
/* Sockets can be hashed in established or listening table.
107137
* We must use different 'nulls' end-of-chain value for all hash buckets :
108138
* A socket might transition from ESTABLISH to LISTEN state without
@@ -134,6 +164,12 @@ struct inet_hashinfo {
134164
*/
135165
struct kmem_cache *bind_bucket_cachep;
136166
struct inet_bind_hashbucket *bhash;
167+
/* The 2nd binding table hashed by port and address.
168+
* This is used primarily for expediting the resolution of bind
169+
* conflicts.
170+
*/
171+
struct kmem_cache *bind2_bucket_cachep;
172+
struct inet_bind2_hashbucket *bhash2;
137173
unsigned int bhash_size;
138174

139175
/* The 2nd listener table hashed by local port and address */
@@ -193,14 +229,44 @@ inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net,
193229
void inet_bind_bucket_destroy(struct kmem_cache *cachep,
194230
struct inet_bind_bucket *tb);
195231

232+
static inline bool check_bind_bucket_match(struct inet_bind_bucket *tb,
233+
struct net *net,
234+
const unsigned short port,
235+
int l3mdev)
236+
{
237+
return net_eq(ib_net(tb), net) && tb->port == port &&
238+
tb->l3mdev == l3mdev;
239+
}
240+
241+
struct inet_bind2_bucket *
242+
inet_bind2_bucket_create(struct kmem_cache *cachep, struct net *net,
243+
struct inet_bind2_hashbucket *head,
244+
const unsigned short port, int l3mdev,
245+
const struct sock *sk);
246+
247+
void inet_bind2_bucket_destroy(struct kmem_cache *cachep,
248+
struct inet_bind2_bucket *tb);
249+
250+
struct inet_bind2_bucket *
251+
inet_bind2_bucket_find(struct inet_hashinfo *hinfo, struct net *net,
252+
const unsigned short port, int l3mdev,
253+
struct sock *sk,
254+
struct inet_bind2_hashbucket **head);
255+
256+
bool check_bind2_bucket_match_nulladdr(struct inet_bind2_bucket *tb,
257+
struct net *net,
258+
const unsigned short port,
259+
int l3mdev,
260+
const struct sock *sk);
261+
196262
/* Map local port @lport in namespace @net to an index into the bind hash
 * table by adding the per-namespace hash mix and masking the sum.
 *
 * NOTE(review): masking with (bhash_size - 1) presumably relies on
 * @bhash_size being a power of two - confirm where the table is sized.
 */
static inline u32 inet_bhashfn(const struct net *net, const __u16 lport,
			       const u32 bhash_size)
{
	const u32 mask = bhash_size - 1;

	return (lport + net_hash_mix(net)) & mask;
}
201267

202268
void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
203-
const unsigned short snum);
269+
struct inet_bind2_bucket *tb2, const unsigned short snum);
204270

205271
/* Caller must disable local BH processing. */
206272
int __inet_inherit_port(const struct sock *sk, struct sock *child);

include/net/sock.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,7 @@ struct sk_filter;
348348
* @sk_txtime_report_errors: set report errors mode for SO_TXTIME
349349
* @sk_txtime_unused: unused txtime flags
350350
* @ns_tracker: tracker for netns reference
351+
* @sk_bind2_node: bind node in the bhash2 table
351352
*/
352353
struct sock {
353354
/*
@@ -537,6 +538,7 @@ struct sock {
537538
#endif
538539
struct rcu_head sk_rcu;
539540
netns_tracker ns_tracker;
541+
struct hlist_node sk_bind2_node;
540542
};
541543

542544
enum sk_pacing {
@@ -817,6 +819,16 @@ static inline void sk_add_bind_node(struct sock *sk,
817819
hlist_add_head(&sk->sk_bind_node, list);
818820
}
819821

822+
static inline void __sk_del_bind2_node(struct sock *sk)
823+
{
824+
__hlist_del(&sk->sk_bind2_node);
825+
}
826+
827+
static inline void sk_add_bind2_node(struct sock *sk, struct hlist_head *list)
828+
{
829+
hlist_add_head(&sk->sk_bind2_node, list);
830+
}
831+
820832
#define sk_for_each(__sk, list) \
821833
hlist_for_each_entry(__sk, list, sk_node)
822834
#define sk_for_each_rcu(__sk, list) \
@@ -834,6 +846,8 @@ static inline void sk_add_bind_node(struct sock *sk,
834846
hlist_for_each_entry_safe(__sk, tmp, list, sk_node)
835847
#define sk_for_each_bound(__sk, list) \
836848
hlist_for_each_entry(__sk, list, sk_bind_node)
849+
/* Walk the sockets linked into @list through their sk_bind2_node member */
#define sk_for_each_bound_bhash2(__sk, list) \
	hlist_for_each_entry(__sk, list, sk_bind2_node)
837851

838852
/**
839853
* sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset

net/dccp/proto.c

Lines changed: 28 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1120,6 +1120,12 @@ static int __init dccp_init(void)
11201120
SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
11211121
if (!dccp_hashinfo.bind_bucket_cachep)
11221122
goto out_free_hashinfo2;
1123+
dccp_hashinfo.bind2_bucket_cachep =
1124+
kmem_cache_create("dccp_bind2_bucket",
1125+
sizeof(struct inet_bind2_bucket), 0,
1126+
SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
1127+
if (!dccp_hashinfo.bind2_bucket_cachep)
1128+
goto out_free_bind_bucket_cachep;
11231129

11241130
/*
11251131
* Size and allocate the main established and bind bucket
@@ -1150,7 +1156,7 @@ static int __init dccp_init(void)
11501156

11511157
if (!dccp_hashinfo.ehash) {
11521158
DCCP_CRIT("Failed to allocate DCCP established hash table");
1153-
goto out_free_bind_bucket_cachep;
1159+
goto out_free_bind2_bucket_cachep;
11541160
}
11551161

11561162
for (i = 0; i <= dccp_hashinfo.ehash_mask; i++)
@@ -1176,14 +1182,23 @@ static int __init dccp_init(void)
11761182
goto out_free_dccp_locks;
11771183
}
11781184

1185+
dccp_hashinfo.bhash2 = (struct inet_bind2_hashbucket *)
1186+
__get_free_pages(GFP_ATOMIC | __GFP_NOWARN, bhash_order);
1187+
1188+
if (!dccp_hashinfo.bhash2) {
1189+
DCCP_CRIT("Failed to allocate DCCP bind2 hash table");
1190+
goto out_free_dccp_bhash;
1191+
}
1192+
11791193
for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
11801194
spin_lock_init(&dccp_hashinfo.bhash[i].lock);
11811195
INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain);
1196+
INIT_HLIST_HEAD(&dccp_hashinfo.bhash2[i].chain);
11821197
}
11831198

11841199
rc = dccp_mib_init();
11851200
if (rc)
1186-
goto out_free_dccp_bhash;
1201+
goto out_free_dccp_bhash2;
11871202

11881203
rc = dccp_ackvec_init();
11891204
if (rc)
@@ -1207,30 +1222,38 @@ static int __init dccp_init(void)
12071222
dccp_ackvec_exit();
12081223
out_free_dccp_mib:
12091224
dccp_mib_exit();
1225+
out_free_dccp_bhash2:
1226+
free_pages((unsigned long)dccp_hashinfo.bhash2, bhash_order);
12101227
out_free_dccp_bhash:
12111228
free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
12121229
out_free_dccp_locks:
12131230
inet_ehash_locks_free(&dccp_hashinfo);
12141231
out_free_dccp_ehash:
12151232
free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order);
1233+
out_free_bind2_bucket_cachep:
1234+
kmem_cache_destroy(dccp_hashinfo.bind2_bucket_cachep);
12161235
out_free_bind_bucket_cachep:
12171236
kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
12181237
out_free_hashinfo2:
12191238
inet_hashinfo2_free_mod(&dccp_hashinfo);
12201239
out_fail:
12211240
dccp_hashinfo.bhash = NULL;
1241+
dccp_hashinfo.bhash2 = NULL;
12221242
dccp_hashinfo.ehash = NULL;
12231243
dccp_hashinfo.bind_bucket_cachep = NULL;
1244+
dccp_hashinfo.bind2_bucket_cachep = NULL;
12241245
return rc;
12251246
}
12261247

12271248
static void __exit dccp_fini(void)
12281249
{
1250+
int bhash_order = get_order(dccp_hashinfo.bhash_size *
1251+
sizeof(struct inet_bind_hashbucket));
1252+
12291253
ccid_cleanup_builtins();
12301254
dccp_mib_exit();
1231-
free_pages((unsigned long)dccp_hashinfo.bhash,
1232-
get_order(dccp_hashinfo.bhash_size *
1233-
sizeof(struct inet_bind_hashbucket)));
1255+
free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
1256+
free_pages((unsigned long)dccp_hashinfo.bhash2, bhash_order);
12341257
free_pages((unsigned long)dccp_hashinfo.ehash,
12351258
get_order((dccp_hashinfo.ehash_mask + 1) *
12361259
sizeof(struct inet_ehash_bucket)));

0 commit comments

Comments
 (0)