Skip to content

Commit ff61a4a

Browse files
author
Paolo Abeni
committed
Merge branch 'ip-improve-tcp-sock-multipath-routing'
Willem de Bruijn says: ==================== ip: improve tcp sock multipath routing From: Willem de Bruijn <[email protected]> Improve layer 4 multipath hash policy for local tcp connections: patch 1: Select a source address that matches the nexthop device. Due to tcp_v4_connect making separate route lookups for saddr and route, the two can currently be inconsistent. patch 2: Use all paths when opening multiple local tcp connections to the same ip address and port. patch 3: Test the behavior. Extend the fib_tests.sh testsuite with one opening many connections, and count SYNs on both egress devices, for packets matching the source address of the dev. Changelog in the individual patches ==================== Link: https://patch.msgid.link/[email protected] Signed-off-by: Paolo Abeni <[email protected]>
2 parents 0d15a26 + 4d0dac4 commit ff61a4a

File tree

9 files changed

+197
-23
lines changed

9 files changed

+197
-23
lines changed

include/net/flow.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ struct flowi_common {
3939
#define FLOWI_FLAG_ANYSRC 0x01
4040
#define FLOWI_FLAG_KNOWN_NH 0x02
4141
#define FLOWI_FLAG_L3MDEV_OIF 0x04
42+
#define FLOWI_FLAG_ANY_SPORT 0x08
4243
__u32 flowic_secid;
4344
kuid_t flowic_uid;
4445
__u32 flowic_multipath_hash;

include/net/ip_fib.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -574,7 +574,8 @@ static inline u32 fib_multipath_hash_from_keys(const struct net *net,
574574

575575
int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope,
576576
struct netlink_ext_ack *extack);
577-
void fib_select_multipath(struct fib_result *res, int hash);
577+
void fib_select_multipath(struct fib_result *res, int hash,
578+
const struct flowi4 *fl4);
578579
void fib_select_path(struct net *net, struct fib_result *res,
579580
struct flowi4 *fl4, const struct sk_buff *skb);
580581

include/net/route.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -326,6 +326,9 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst,
326326
if (inet_test_bit(TRANSPARENT, sk))
327327
flow_flags |= FLOWI_FLAG_ANYSRC;
328328

329+
if (IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) && !sport)
330+
flow_flags |= FLOWI_FLAG_ANY_SPORT;
331+
329332
flowi4_init_output(fl4, oif, READ_ONCE(sk->sk_mark), ip_sock_rt_tos(sk),
330333
ip_sock_rt_scope(sk), protocol, flow_flags, dst,
331334
src, dport, sport, sk->sk_uid);

net/ipv4/fib_semantics.c

Lines changed: 25 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2170,34 +2170,45 @@ static bool fib_good_nh(const struct fib_nh *nh)
21702170
return !!(state & NUD_VALID);
21712171
}
21722172

2173-
void fib_select_multipath(struct fib_result *res, int hash)
2173+
void fib_select_multipath(struct fib_result *res, int hash,
2174+
const struct flowi4 *fl4)
21742175
{
21752176
struct fib_info *fi = res->fi;
21762177
struct net *net = fi->fib_net;
2177-
bool first = false;
2178+
bool found = false;
2179+
bool use_neigh;
2180+
__be32 saddr;
21782181

21792182
if (unlikely(res->fi->nh)) {
21802183
nexthop_path_fib_result(res, hash);
21812184
return;
21822185
}
21832186

2187+
use_neigh = READ_ONCE(net->ipv4.sysctl_fib_multipath_use_neigh);
2188+
saddr = fl4 ? fl4->saddr : 0;
2189+
21842190
change_nexthops(fi) {
2185-
if (READ_ONCE(net->ipv4.sysctl_fib_multipath_use_neigh)) {
2186-
if (!fib_good_nh(nexthop_nh))
2187-
continue;
2188-
if (!first) {
2189-
res->nh_sel = nhsel;
2190-
res->nhc = &nexthop_nh->nh_common;
2191-
first = true;
2192-
}
2191+
if (use_neigh && !fib_good_nh(nexthop_nh))
2192+
continue;
2193+
2194+
if (!found) {
2195+
res->nh_sel = nhsel;
2196+
res->nhc = &nexthop_nh->nh_common;
2197+
found = !saddr || nexthop_nh->nh_saddr == saddr;
21932198
}
21942199

21952200
if (hash > atomic_read(&nexthop_nh->fib_nh_upper_bound))
21962201
continue;
21972202

2198-
res->nh_sel = nhsel;
2199-
res->nhc = &nexthop_nh->nh_common;
2200-
return;
2203+
if (!saddr || nexthop_nh->nh_saddr == saddr) {
2204+
res->nh_sel = nhsel;
2205+
res->nhc = &nexthop_nh->nh_common;
2206+
return;
2207+
}
2208+
2209+
if (found)
2210+
return;
2211+
22012212
} endfor_nexthops(fi);
22022213
}
22032214
#endif
@@ -2212,7 +2223,7 @@ void fib_select_path(struct net *net, struct fib_result *res,
22122223
if (fib_info_num_path(res->fi) > 1) {
22132224
int h = fib_multipath_hash(net, fl4, skb, NULL);
22142225

2215-
fib_select_multipath(res, h);
2226+
fib_select_multipath(res, h, fl4);
22162227
}
22172228
else
22182229
#endif

net/ipv4/route.c

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2037,8 +2037,12 @@ static u32 fib_multipath_custom_hash_fl4(const struct net *net,
20372037
hash_keys.addrs.v4addrs.dst = fl4->daddr;
20382038
if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
20392039
hash_keys.basic.ip_proto = fl4->flowi4_proto;
2040-
if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
2041-
hash_keys.ports.src = fl4->fl4_sport;
2040+
if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) {
2041+
if (fl4->flowi4_flags & FLOWI_FLAG_ANY_SPORT)
2042+
hash_keys.ports.src = (__force __be16)get_random_u16();
2043+
else
2044+
hash_keys.ports.src = fl4->fl4_sport;
2045+
}
20422046
if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
20432047
hash_keys.ports.dst = fl4->fl4_dport;
20442048

@@ -2093,7 +2097,10 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
20932097
hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
20942098
hash_keys.addrs.v4addrs.src = fl4->saddr;
20952099
hash_keys.addrs.v4addrs.dst = fl4->daddr;
2096-
hash_keys.ports.src = fl4->fl4_sport;
2100+
if (fl4->flowi4_flags & FLOWI_FLAG_ANY_SPORT)
2101+
hash_keys.ports.src = (__force __be16)get_random_u16();
2102+
else
2103+
hash_keys.ports.src = fl4->fl4_sport;
20972104
hash_keys.ports.dst = fl4->fl4_dport;
20982105
hash_keys.basic.ip_proto = fl4->flowi4_proto;
20992106
}
@@ -2154,7 +2161,7 @@ ip_mkroute_input(struct sk_buff *skb, struct fib_result *res,
21542161
if (res->fi && fib_info_num_path(res->fi) > 1) {
21552162
int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
21562163

2157-
fib_select_multipath(res, h);
2164+
fib_select_multipath(res, h, NULL);
21582165
IPCB(skb)->flags |= IPSKB_MULTIPATH;
21592166
}
21602167
#endif

net/ipv6/route.c

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2492,8 +2492,12 @@ static u32 rt6_multipath_custom_hash_fl6(const struct net *net,
24922492
hash_keys.basic.ip_proto = fl6->flowi6_proto;
24932493
if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL)
24942494
hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
2495-
if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
2496-
hash_keys.ports.src = fl6->fl6_sport;
2495+
if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) {
2496+
if (fl6->flowi6_flags & FLOWI_FLAG_ANY_SPORT)
2497+
hash_keys.ports.src = (__force __be16)get_random_u16();
2498+
else
2499+
hash_keys.ports.src = fl6->fl6_sport;
2500+
}
24972501
if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
24982502
hash_keys.ports.dst = fl6->fl6_dport;
24992503

@@ -2547,7 +2551,10 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
25472551
hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
25482552
hash_keys.addrs.v6addrs.src = fl6->saddr;
25492553
hash_keys.addrs.v6addrs.dst = fl6->daddr;
2550-
hash_keys.ports.src = fl6->fl6_sport;
2554+
if (fl6->flowi6_flags & FLOWI_FLAG_ANY_SPORT)
2555+
hash_keys.ports.src = (__force __be16)get_random_u16();
2556+
else
2557+
hash_keys.ports.src = fl6->fl6_sport;
25512558
hash_keys.ports.dst = fl6->fl6_dport;
25522559
hash_keys.basic.ip_proto = fl6->flowi6_proto;
25532560
}

net/ipv6/tcp_ipv6.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -267,6 +267,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
267267
fl6.flowi6_mark = sk->sk_mark;
268268
fl6.fl6_dport = usin->sin6_port;
269269
fl6.fl6_sport = inet->inet_sport;
270+
if (IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) && !fl6.fl6_sport)
271+
fl6.flowi6_flags = FLOWI_FLAG_ANY_SPORT;
270272
fl6.flowi6_uid = sk->sk_uid;
271273

272274
opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk));

tools/testing/selftests/net/fib_tests.sh

Lines changed: 119 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ TESTS="unregister down carrier nexthop suppress ipv6_notify ipv4_notify \
1111
ipv6_rt ipv4_rt ipv6_addr_metric ipv4_addr_metric ipv6_route_metrics \
1212
ipv4_route_metrics ipv4_route_v6_gw rp_filter ipv4_del_addr \
1313
ipv6_del_addr ipv4_mangle ipv6_mangle ipv4_bcast_neigh fib6_gc_test \
14-
ipv4_mpath_list ipv6_mpath_list"
14+
ipv4_mpath_list ipv6_mpath_list ipv4_mpath_balance ipv6_mpath_balance"
1515

1616
VERBOSE=0
1717
PAUSE_ON_FAIL=no
@@ -1085,6 +1085,35 @@ route_setup()
10851085
set +e
10861086
}
10871087

1088+
forwarding_cleanup()
1089+
{
1090+
cleanup_ns $ns3
1091+
1092+
route_cleanup
1093+
}
1094+
1095+
# extend route_setup with an ns3 reachable through ns2 over both devices
1096+
forwarding_setup()
1097+
{
1098+
forwarding_cleanup
1099+
1100+
route_setup
1101+
1102+
setup_ns ns3
1103+
1104+
ip link add veth5 netns $ns3 type veth peer name veth6 netns $ns2
1105+
ip -netns $ns3 link set veth5 up
1106+
ip -netns $ns2 link set veth6 up
1107+
1108+
ip -netns $ns3 -4 addr add dev veth5 172.16.105.1/24
1109+
ip -netns $ns2 -4 addr add dev veth6 172.16.105.2/24
1110+
ip -netns $ns3 -4 route add 172.16.100.0/22 via 172.16.105.2
1111+
1112+
ip -netns $ns3 -6 addr add dev veth5 2001:db8:105::1/64 nodad
1113+
ip -netns $ns2 -6 addr add dev veth6 2001:db8:105::2/64 nodad
1114+
ip -netns $ns3 -6 route add 2001:db8:101::/33 via 2001:db8:105::2
1115+
}
1116+
10881117
# assumption is that basic add of a single path route works
10891118
# otherwise just adding an address on an interface is broken
10901119
ipv6_rt_add()
@@ -2600,6 +2629,93 @@ ipv6_mpath_list_test()
26002629
route_cleanup
26012630
}
26022631

2632+
tc_set_flower_counter__saddr_syn() {
2633+
tc_set_flower_counter $1 $2 $3 "src_ip $4 ip_proto tcp tcp_flags 0x2"
2634+
}
2635+
2636+
ip_mpath_balance_dep_check()
2637+
{
2638+
if [ ! -x "$(command -v socat)" ]; then
2639+
echo "socat command not found. Skipping test"
2640+
return 1
2641+
fi
2642+
2643+
if [ ! -x "$(command -v jq)" ]; then
2644+
echo "jq command not found. Skipping test"
2645+
return 1
2646+
fi
2647+
}
2648+
2649+
ip_mpath_balance() {
2650+
local -r ipver=$1
2651+
local -r daddr=$2
2652+
local -r num_conn=20
2653+
2654+
for i in $(seq 1 $num_conn); do
2655+
ip netns exec $ns3 socat $ipver TCP-LISTEN:8000 STDIO >/dev/null &
2656+
sleep 0.02
2657+
echo -n a | ip netns exec $ns1 socat $ipver STDIO TCP:$daddr:8000
2658+
done
2659+
2660+
local -r syn0="$(tc_get_flower_counter $ns1 veth1)"
2661+
local -r syn1="$(tc_get_flower_counter $ns1 veth3)"
2662+
local -r syns=$((syn0+syn1))
2663+
2664+
[ "$VERBOSE" = "1" ] && echo "multipath: syns seen: ($syn0,$syn1)"
2665+
2666+
[[ $syns -ge $num_conn ]] && [[ $syn0 -gt 0 ]] && [[ $syn1 -gt 0 ]]
2667+
}
2668+
2669+
ipv4_mpath_balance_test()
2670+
{
2671+
echo
2672+
echo "IPv4 multipath load balance test"
2673+
2674+
ip_mpath_balance_dep_check || return 1
2675+
forwarding_setup
2676+
2677+
$IP route add 172.16.105.1 \
2678+
nexthop via 172.16.101.2 \
2679+
nexthop via 172.16.103.2
2680+
2681+
ip netns exec $ns1 \
2682+
sysctl -q -w net.ipv4.fib_multipath_hash_policy=1
2683+
2684+
tc_set_flower_counter__saddr_syn $ns1 4 veth1 172.16.101.1
2685+
tc_set_flower_counter__saddr_syn $ns1 4 veth3 172.16.103.1
2686+
2687+
ip_mpath_balance -4 172.16.105.1
2688+
2689+
log_test $? 0 "IPv4 multipath loadbalance"
2690+
2691+
forwarding_cleanup
2692+
}
2693+
2694+
ipv6_mpath_balance_test()
2695+
{
2696+
echo
2697+
echo "IPv6 multipath load balance test"
2698+
2699+
ip_mpath_balance_dep_check || return 1
2700+
forwarding_setup
2701+
2702+
$IP route add 2001:db8:105::1\
2703+
nexthop via 2001:db8:101::2 \
2704+
nexthop via 2001:db8:103::2
2705+
2706+
ip netns exec $ns1 \
2707+
sysctl -q -w net.ipv6.fib_multipath_hash_policy=1
2708+
2709+
tc_set_flower_counter__saddr_syn $ns1 6 veth1 2001:db8:101::1
2710+
tc_set_flower_counter__saddr_syn $ns1 6 veth3 2001:db8:103::1
2711+
2712+
ip_mpath_balance -6 "[2001:db8:105::1]"
2713+
2714+
log_test $? 0 "IPv6 multipath loadbalance"
2715+
2716+
forwarding_cleanup
2717+
}
2718+
26032719
################################################################################
26042720
# usage
26052721

@@ -2683,6 +2799,8 @@ do
26832799
fib6_gc_test|ipv6_gc) fib6_gc_test;;
26842800
ipv4_mpath_list) ipv4_mpath_list_test;;
26852801
ipv6_mpath_list) ipv6_mpath_list_test;;
2802+
ipv4_mpath_balance) ipv4_mpath_balance_test;;
2803+
ipv6_mpath_balance) ipv6_mpath_balance_test;;
26862804

26872805
help) echo "Test names: $TESTS"; exit 0;;
26882806
esac

tools/testing/selftests/net/lib.sh

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,30 @@ tc_rule_handle_stats_get()
270270
.options.actions[0].stats$selector"
271271
}
272272

273+
# attach a qdisc with two children match/no-match and a flower filter to match
274+
tc_set_flower_counter() {
275+
local -r ns=$1
276+
local -r ipver=$2
277+
local -r dev=$3
278+
local -r flower_expr=$4
279+
280+
tc -n $ns qdisc add dev $dev root handle 1: prio bands 2 \
281+
priomap 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
282+
283+
tc -n $ns qdisc add dev $dev parent 1:1 handle 11: pfifo
284+
tc -n $ns qdisc add dev $dev parent 1:2 handle 12: pfifo
285+
286+
tc -n $ns filter add dev $dev parent 1: protocol ipv$ipver \
287+
flower $flower_expr classid 1:2
288+
}
289+
290+
tc_get_flower_counter() {
291+
local -r ns=$1
292+
local -r dev=$2
293+
294+
tc -n $ns -j -s qdisc show dev $dev handle 12: | jq .[0].packets
295+
}
296+
273297
ret_set_ksft_status()
274298
{
275299
local ksft_status=$1; shift

0 commit comments

Comments
 (0)