Skip to content

Commit 2a8d8e0

Browse files
committed
Merge branch 'blackhole-device-to-invalidate-dst'
Mahesh Bandewar says: ==================== blackhole device to invalidate dst When we invalidate dst or mark it "dead", we assign 'lo' to dst->dev. First of all this assignment is racy and more over, it has MTU implications. The standard dev MTU is 1500 while the Loopback MTU is 64k. TCP code when dereferencing the dst don't check if the dst is valid or not. TCP when dereferencing a dead-dst while negotiating a new connection, may use dst device which is 'lo' instead of using the correct device. Consider the following scenario: A SYN arrives on an interface and tcp-layer while processing SYNACK finds a dst and associates it with SYNACK skb. Now before skb gets passed to L3 for processing, if that dst gets "dead" (because of the virtual device getting disappeared & then reappeared), the 'lo' gets assigned to that dst (lo MTU = 64k). Let's assume the SYN has ADV_MSS set as 9k while the output device through which this SYNACK is going to go out has standard MTU of 1500. The MTU check during the route check passes since MIN(9K, 64K) is 9k and TCP successfully negotiates 9k MSS. The subsequent data packet; bigger in size gets passed to the device and it won't be marked as GSO since the assumed MTU of the device is 9k. This either crashes the NIC and we have seen fixes that went into drivers to handle this scenario. 8914a59 ('bnx2x: disable GSO where gso_size is too big for hardware') and 2b16f04 ('net: create skb_gso_validate_mac_len()') and with those fixes TCP eventually recovers but not before few dropped segments. Well, I'm not a TCP expert and though we have experienced these corner cases in our environment, I could not reproduce this case reliably in my test setup to try this fix myself. However, Michael Chan <[email protected]> had a setup where these fixes helped him mitigate the issue and not cause the crash. The idea here is to not alter the data-path with additional locks or smb()/rmb() barriers to avoid racy assignments but to create a new device that has really low MTU that has .ndo_start_xmit essentially a kfree_skb(). Make use of this device instead of 'lo' when marking the dst dead. First patch implements the blackhole device and second patch uses it in IPv4 and IPv6 stack while the third patch is the self test that ensures the sanity of this device. v1->v2 fixed the self-test patch to handle the conflict v2 -> v3 fixed Kconfig text/string. ==================== Signed-off-by: David S. Miller <[email protected]>
2 parents 8909783 + 509e56b commit 2a8d8e0

File tree

11 files changed

+195
-14
lines changed

11 files changed

+195
-14
lines changed

drivers/net/loopback.c

Lines changed: 67 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,13 @@
5555
#include <net/net_namespace.h>
5656
#include <linux/u64_stats_sync.h>
5757

58+
/* blackhole_netdev - a device used for dsts that are marked expired!
59+
* This is global device (instead of per-net-ns) since it's not needed
60+
* to be per-ns and gets initialized at boot time.
61+
*/
62+
struct net_device *blackhole_netdev;
63+
EXPORT_SYMBOL(blackhole_netdev);
64+
5865
/* The higher levels take care of making this non-reentrant (it's
5966
* called with bh's disabled).
6067
*/
@@ -150,12 +157,14 @@ static const struct net_device_ops loopback_ops = {
150157
.ndo_set_mac_address = eth_mac_addr,
151158
};
152159

153-
/* The loopback device is special. There is only one instance
154-
* per network namespace.
155-
*/
156-
static void loopback_setup(struct net_device *dev)
160+
static void gen_lo_setup(struct net_device *dev,
161+
unsigned int mtu,
162+
const struct ethtool_ops *eth_ops,
163+
const struct header_ops *hdr_ops,
164+
const struct net_device_ops *dev_ops,
165+
void (*dev_destructor)(struct net_device *dev))
157166
{
158-
dev->mtu = 64 * 1024;
167+
dev->mtu = mtu;
159168
dev->hard_header_len = ETH_HLEN; /* 14 */
160169
dev->min_header_len = ETH_HLEN; /* 14 */
161170
dev->addr_len = ETH_ALEN; /* 6 */
@@ -174,11 +183,20 @@ static void loopback_setup(struct net_device *dev)
174183
| NETIF_F_NETNS_LOCAL
175184
| NETIF_F_VLAN_CHALLENGED
176185
| NETIF_F_LOOPBACK;
177-
dev->ethtool_ops = &loopback_ethtool_ops;
178-
dev->header_ops = &eth_header_ops;
179-
dev->netdev_ops = &loopback_ops;
186+
dev->ethtool_ops = eth_ops;
187+
dev->header_ops = hdr_ops;
188+
dev->netdev_ops = dev_ops;
180189
dev->needs_free_netdev = true;
181-
dev->priv_destructor = loopback_dev_free;
190+
dev->priv_destructor = dev_destructor;
191+
}
192+
193+
/* The loopback device is special. There is only one instance
194+
* per network namespace.
195+
*/
196+
static void loopback_setup(struct net_device *dev)
197+
{
198+
gen_lo_setup(dev, (64 * 1024), &loopback_ethtool_ops, &eth_header_ops,
199+
&loopback_ops, loopback_dev_free);
182200
}
183201

184202
/* Setup and register the loopback device. */
@@ -213,3 +231,43 @@ static __net_init int loopback_net_init(struct net *net)
213231
struct pernet_operations __net_initdata loopback_net_ops = {
214232
.init = loopback_net_init,
215233
};
234+
235+
/* blackhole netdevice */
236+
static netdev_tx_t blackhole_netdev_xmit(struct sk_buff *skb,
237+
struct net_device *dev)
238+
{
239+
kfree_skb(skb);
240+
net_warn_ratelimited("%s(): Dropping skb.\n", __func__);
241+
return NETDEV_TX_OK;
242+
}
243+
244+
static const struct net_device_ops blackhole_netdev_ops = {
245+
.ndo_start_xmit = blackhole_netdev_xmit,
246+
};
247+
248+
/* This is a dst-dummy device used specifically for invalidated
249+
* DSTs and unlike loopback, this is not per-ns.
250+
*/
251+
static void blackhole_netdev_setup(struct net_device *dev)
252+
{
253+
gen_lo_setup(dev, ETH_MIN_MTU, NULL, NULL, &blackhole_netdev_ops, NULL);
254+
}
255+
256+
/* Setup and register the blackhole_netdev. */
257+
static int __init blackhole_netdev_init(void)
258+
{
259+
blackhole_netdev = alloc_netdev(0, "blackhole_dev", NET_NAME_UNKNOWN,
260+
blackhole_netdev_setup);
261+
if (!blackhole_netdev)
262+
return -ENOMEM;
263+
264+
dev_init_scheduler(blackhole_netdev);
265+
dev_activate(blackhole_netdev);
266+
267+
blackhole_netdev->flags |= IFF_UP | IFF_RUNNING;
268+
dev_net_set(blackhole_netdev, &init_net);
269+
270+
return 0;
271+
}
272+
273+
device_initcall(blackhole_netdev_init);

include/linux/netdevice.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4870,4 +4870,6 @@ do { \
48704870
#define PTYPE_HASH_SIZE (16)
48714871
#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
48724872

4873+
extern struct net_device *blackhole_netdev;
4874+
48734875
#endif /* _LINUX_NETDEVICE_H */

lib/Kconfig.debug

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1909,6 +1909,15 @@ config TEST_BPF
19091909

19101910
If unsure, say N.
19111911

1912+
config TEST_BLACKHOLE_DEV
1913+
tristate "Test blackhole netdev functionality"
1914+
depends on m && NET
1915+
help
1916+
This builds the "test_blackhole_dev" module that validates the
1917+
data path through this blackhole netdev.
1918+
1919+
If unsure, say N.
1920+
19121921
config FIND_BIT_BENCHMARK
19131922
tristate "Test find_bit functions"
19141923
help

lib/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ obj-$(CONFIG_TEST_DEBUG_VIRTUAL) += test_debug_virtual.o
9191
obj-$(CONFIG_TEST_MEMCAT_P) += test_memcat_p.o
9292
obj-$(CONFIG_TEST_OBJAGG) += test_objagg.o
9393
obj-$(CONFIG_TEST_STACKINIT) += test_stackinit.o
94+
obj-$(CONFIG_TEST_BLACKHOLE_DEV) += test_blackhole_dev.o
9495

9596
obj-$(CONFIG_TEST_LIVEPATCH) += livepatch/
9697

lib/test_blackhole_dev.c

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
// SPDX-License-Identifier: GPL-2.0
2+
/*
3+
* This module tests the blackhole_dev that is created during the
4+
* net subsystem initialization. The test this module performs is
5+
* by injecting an skb into the stack with skb->dev as the
6+
* blackhole_dev and expects kernel to behave in a sane manner
7+
* (in other words, *not crash*)!
8+
*
9+
* Copyright (c) 2018, Mahesh Bandewar <[email protected]>
10+
*/
11+
12+
#include <linux/init.h>
13+
#include <linux/module.h>
14+
#include <linux/printk.h>
15+
#include <linux/skbuff.h>
16+
#include <linux/netdevice.h>
17+
#include <linux/udp.h>
18+
#include <linux/ipv6.h>
19+
20+
#include <net/dst.h>
21+
22+
#define SKB_SIZE 256
23+
#define HEAD_SIZE (14+40+8) /* Ether + IPv6 + UDP */
24+
#define TAIL_SIZE 32 /* random tail-room */
25+
26+
#define UDP_PORT 1234
27+
28+
static int __init test_blackholedev_init(void)
29+
{
30+
struct ipv6hdr *ip6h;
31+
struct sk_buff *skb;
32+
struct ethhdr *ethh;
33+
struct udphdr *uh;
34+
int data_len;
35+
int ret;
36+
37+
skb = alloc_skb(SKB_SIZE, GFP_KERNEL);
38+
if (!skb)
39+
return -ENOMEM;
40+
41+
/* Reserve head-room for the headers */
42+
skb_reserve(skb, HEAD_SIZE);
43+
44+
/* Add data to the skb */
45+
data_len = SKB_SIZE - (HEAD_SIZE + TAIL_SIZE);
46+
memset(__skb_put(skb, data_len), 0xf, data_len);
47+
48+
/* Add protocol data */
49+
/* (Transport) UDP */
50+
uh = (struct udphdr *)skb_push(skb, sizeof(struct udphdr));
51+
skb_set_transport_header(skb, 0);
52+
uh->source = uh->dest = htons(UDP_PORT);
53+
uh->len = htons(data_len);
54+
uh->check = 0;
55+
/* (Network) IPv6 */
56+
ip6h = (struct ipv6hdr *)skb_push(skb, sizeof(struct ipv6hdr));
57+
skb_set_network_header(skb, 0);
58+
ip6h->hop_limit = 32;
59+
ip6h->payload_len = data_len + sizeof(struct udphdr);
60+
ip6h->nexthdr = IPPROTO_UDP;
61+
ip6h->saddr = in6addr_loopback;
62+
ip6h->daddr = in6addr_loopback;
63+
/* Ether */
64+
ethh = (struct ethhdr *)skb_push(skb, sizeof(struct ethhdr));
65+
skb_set_mac_header(skb, 0);
66+
67+
skb->protocol = htons(ETH_P_IPV6);
68+
skb->pkt_type = PACKET_HOST;
69+
skb->dev = blackhole_netdev;
70+
71+
/* Now attempt to send the packet */
72+
ret = dev_queue_xmit(skb);
73+
74+
switch (ret) {
75+
case NET_XMIT_SUCCESS:
76+
pr_warn("dev_queue_xmit() returned NET_XMIT_SUCCESS\n");
77+
break;
78+
case NET_XMIT_DROP:
79+
pr_warn("dev_queue_xmit() returned NET_XMIT_DROP\n");
80+
break;
81+
case NET_XMIT_CN:
82+
pr_warn("dev_queue_xmit() returned NET_XMIT_CN\n");
83+
break;
84+
default:
85+
pr_err("dev_queue_xmit() returned UNKNOWN(%d)\n", ret);
86+
}
87+
88+
return 0;
89+
}
90+
91+
static void __exit test_blackholedev_exit(void)
92+
{
93+
pr_warn("test_blackholedev module terminating.\n");
94+
}
95+
96+
module_init(test_blackholedev_init);
97+
module_exit(test_blackholedev_exit);
98+
99+
MODULE_AUTHOR("Mahesh Bandewar <[email protected]>");
100+
MODULE_LICENSE("GPL");

net/core/dst.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,7 @@ void dst_dev_put(struct dst_entry *dst)
160160
dst->ops->ifdown(dst, dev, true);
161161
dst->input = dst_discard;
162162
dst->output = dst_discard_out;
163-
dst->dev = dev_net(dst->dev)->loopback_dev;
163+
dst->dev = blackhole_netdev;
164164
dev_hold(dst->dev);
165165
dev_put(dev);
166166
}

net/ipv4/route.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1532,7 +1532,6 @@ static void ipv4_dst_destroy(struct dst_entry *dst)
15321532

15331533
void rt_flush_dev(struct net_device *dev)
15341534
{
1535-
struct net *net = dev_net(dev);
15361535
struct rtable *rt;
15371536
int cpu;
15381537

@@ -1543,7 +1542,7 @@ void rt_flush_dev(struct net_device *dev)
15431542
list_for_each_entry(rt, &ul->head, rt_uncached) {
15441543
if (rt->dst.dev != dev)
15451544
continue;
1546-
rt->dst.dev = net->loopback_dev;
1545+
rt->dst.dev = blackhole_netdev;
15471546
dev_hold(rt->dst.dev);
15481547
dev_put(dev);
15491548
}

net/ipv6/route.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,7 @@ static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
176176
}
177177

178178
if (rt_dev == dev) {
179-
rt->dst.dev = loopback_dev;
179+
rt->dst.dev = blackhole_netdev;
180180
dev_hold(rt->dst.dev);
181181
dev_put(rt_dev);
182182
}

tools/testing/selftests/net/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ CFLAGS = -Wall -Wl,--no-as-needed -O2 -g
55
CFLAGS += -I../../../../usr/include/
66

77
TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh \
8-
rtnetlink.sh xfrm_policy.sh
8+
rtnetlink.sh xfrm_policy.sh test_blackhole_dev.sh
99
TEST_PROGS += fib_tests.sh fib-onlink-tests.sh pmtu.sh udpgso.sh ip_defrag.sh
1010
TEST_PROGS += udpgso_bench.sh fib_rule_tests.sh msg_zerocopy.sh psock_snd.sh
1111
TEST_PROGS += udpgro_bench.sh udpgro.sh test_vxlan_under_vrf.sh reuseport_addr_any.sh

tools/testing/selftests/net/config

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,3 +27,4 @@ CONFIG_NFT_CHAIN_NAT_IPV6=m
2727
CONFIG_NFT_CHAIN_NAT_IPV4=m
2828
CONFIG_NET_SCH_FQ=m
2929
CONFIG_NET_SCH_ETF=m
30+
CONFIG_TEST_BLACKHOLE_DEV=m

0 commit comments

Comments
 (0)