
Commit 1d24eb4

Tom Herbert authored and davem330 committed
xps: Transmit Packet Steering
This patch implements transmit packet steering (XPS) for multiqueue devices. XPS selects a transmit queue during packet transmission based on configuration. This is done by mapping the CPU transmitting the packet to a queue. This is the transmit-side analogue to RPS: where RPS selects a CPU based on the receive queue, XPS selects a queue based on the CPU. (There was previously an XPS patch from Eric Dumazet, but that might more appropriately be called transmit completion steering.)

Each transmit queue can be associated with a number of CPUs which will use the queue to send packets. This is configured as a CPU mask on a per-queue basis in:

/sys/class/net/eth<n>/queues/tx-<n>/xps_cpus

The mappings are stored per device in an inverted data structure that maps CPUs to queues. In the netdevice structure this is an array of num_possible_cpu structures, where each structure holds an array of queue_indexes for the queues which that CPU can use.

The benefits of XPS are improved locality in the per-queue data structures. Also, transmit completions are more likely to be done near the sending thread, so this should promote locality back to the socket on free (e.g. for UDP). The benefits of XPS depend on cache hierarchy, application load, and other factors. XPS would nominally be configured so that a queue is shared only by CPUs which share a cache; the degenerate configuration would be that each CPU has its own queue.

Below are some benchmark results which show the potential benefit of this patch. The netperf test ran 500 instances of the netperf TCP_RR test with 1-byte requests and responses.

bnx2x on 16-core AMD
    XPS (16 queues, 1 TX queue per CPU)   1234K at 100% CPU
    No XPS (16 queues)                     996K at 100% CPU

Signed-off-by: Tom Herbert <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
1 parent 3853b58 commit 1d24eb4
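
To make the inverted mapping concrete, here is a small userspace sketch (illustrative only, not part of the patch; the CPU and queue counts and every identifier in it are assumptions): it inverts per-queue CPU masks, of the kind written to xps_cpus, into the per-CPU queue lists that the transmit path consults.

/* Illustrative sketch: invert per-queue CPU masks into per-CPU queue lists. */
#include <stdio.h>

#define NUM_CPUS   4
#define NUM_QUEUES 2

int main(void)
{
        /* Per-queue CPU masks, as one might write to xps_cpus:
         * tx-0 <- CPUs 0,1 (mask 0x3), tx-1 <- CPUs 2,3 (mask 0xc). */
        unsigned long queue_cpu_mask[NUM_QUEUES] = { 0x3, 0xc };

        int cpu_queues[NUM_CPUS][NUM_QUEUES];   /* inverted map: CPU -> queues */
        int cpu_len[NUM_CPUS] = { 0 };

        for (int q = 0; q < NUM_QUEUES; q++)
                for (int cpu = 0; cpu < NUM_CPUS; cpu++)
                        if (queue_cpu_mask[q] & (1UL << cpu))
                                cpu_queues[cpu][cpu_len[cpu]++] = q;

        /* A thread transmitting on CPU n consults only cpu_queues[n]. */
        for (int cpu = 0; cpu < NUM_CPUS; cpu++) {
                printf("cpu %d ->", cpu);
                for (int i = 0; i < cpu_len[cpu]; i++)
                        printf(" tx-%d", cpu_queues[cpu][i]);
                printf("\n");
        }
        return 0;
}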

4 files changed: 447 additions, 8 deletions


include/linux/netdevice.h

Lines changed: 30 additions & 0 deletions
@@ -503,6 +503,10 @@ struct netdev_queue {
         struct Qdisc *qdisc;
         unsigned long state;
         struct Qdisc *qdisc_sleeping;
+#ifdef CONFIG_RPS
+        struct kobject kobj;
+#endif
+
 /*
  * write mostly part
  */
@@ -529,6 +533,30 @@ struct rps_map {
 };
 #define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + (_num * sizeof(u16)))
 
+/*
+ * This structure holds an XPS map which can be of variable length.  The
+ * map is an array of queues.
+ */
+struct xps_map {
+        unsigned int len;
+        unsigned int alloc_len;
+        struct rcu_head rcu;
+        u16 queues[0];
+};
+#define XPS_MAP_SIZE(_num) (sizeof(struct xps_map) + (_num * sizeof(u16)))
+#define XPS_MIN_MAP_ALLOC ((L1_CACHE_BYTES - sizeof(struct xps_map)) \
+    / sizeof(u16))
+
+/*
+ * This structure holds all XPS maps for device.  Maps are indexed by CPU.
+ */
+struct xps_dev_maps {
+        struct rcu_head rcu;
+        struct xps_map *cpu_map[0];
+};
+#define XPS_DEV_MAPS_SIZE (sizeof(struct xps_dev_maps) + \
+    (nr_cpu_ids * sizeof(struct xps_map *)))
+
 /*
  * The rps_dev_flow structure contains the mapping of a flow to a CPU and the
  * tail pointer for that CPU's input queue at the time of last enqueue.
@@ -1016,6 +1044,8 @@ struct net_device {
         unsigned long tx_queue_len;   /* Max frames per queue allowed */
         spinlock_t tx_global_lock;
 
+        struct xps_dev_maps *xps_maps;
+
         /* These may be needed for future network-power-down code. */
 
         /*
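
A minimal userspace sketch of how these definitions size one allocation around the trailing array (illustrative only: the 64-byte cache line, the unsigned short stand-in for u16, and the omission of the rcu_head are assumptions):

/* Illustrative sizing sketch following the XPS_MAP_SIZE/XPS_MIN_MAP_ALLOC pattern. */
#include <stdio.h>
#include <stdlib.h>

#define L1_CACHE_BYTES 64       /* assumed cache line size */

struct xps_map {
        unsigned int len;
        unsigned int alloc_len;
        unsigned short queues[0];       /* trailing array of queue indexes */
};

#define XPS_MAP_SIZE(n) (sizeof(struct xps_map) + ((n) * sizeof(unsigned short)))
#define XPS_MIN_MAP_ALLOC \
        ((L1_CACHE_BYTES - sizeof(struct xps_map)) / sizeof(unsigned short))

int main(void)
{
        /* Size the initial allocation so the header plus queue array fill
         * one cache line, which is what XPS_MIN_MAP_ALLOC arranges. */
        struct xps_map *map = calloc(1, XPS_MAP_SIZE(XPS_MIN_MAP_ALLOC));

        map->alloc_len = XPS_MIN_MAP_ALLOC;
        map->queues[map->len++] = 5;    /* this CPU may use queue 5 */

        printf("alloc_len=%u, bytes=%zu, queues[0]=%u\n",
               map->alloc_len, XPS_MAP_SIZE(map->alloc_len), map->queues[0]);
        free(map);
        return 0;
}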

net/core/dev.c

Lines changed: 50 additions & 3 deletions
@@ -1557,12 +1557,16 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
  */
 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
 {
+        int rc;
+
         if (txq < 1 || txq > dev->num_tx_queues)
                 return -EINVAL;
 
         if (dev->reg_state == NETREG_REGISTERED) {
                 ASSERT_RTNL();
 
+                rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
+                                                  txq);
                 if (txq < dev->real_num_tx_queues)
                         qdisc_reset_all_tx_gt(dev, txq);
         }
@@ -2142,6 +2146,44 @@ static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
         return queue_index;
 }
 
+static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
+{
+#ifdef CONFIG_RPS
+        struct xps_dev_maps *dev_maps;
+        struct xps_map *map;
+        int queue_index = -1;
+
+        rcu_read_lock();
+        dev_maps = rcu_dereference(dev->xps_maps);
+        if (dev_maps) {
+                map = rcu_dereference(
+                    dev_maps->cpu_map[raw_smp_processor_id()]);
+                if (map) {
+                        if (map->len == 1)
+                                queue_index = map->queues[0];
+                        else {
+                                u32 hash;
+                                if (skb->sk && skb->sk->sk_hash)
+                                        hash = skb->sk->sk_hash;
+                                else
+                                        hash = (__force u16) skb->protocol ^
+                                            skb->rxhash;
+                                hash = jhash_1word(hash, hashrnd);
+                                queue_index = map->queues[
+                                    ((u64)hash * map->len) >> 32];
+                        }
+                        if (unlikely(queue_index >= dev->real_num_tx_queues))
+                                queue_index = -1;
+                }
+        }
+        rcu_read_unlock();
+
+        return queue_index;
+#else
+        return -1;
+#endif
+}
+
 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
                                         struct sk_buff *skb)
 {
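
When a CPU's map holds more than one queue, get_xps_queue picks a slot with ((u64)hash * map->len) >> 32 rather than hash % len: multiplying a 32-bit hash by len and keeping the high 32 bits scales the hash uniformly onto [0, len) without a division. A small standalone demonstration (illustrative, not from the patch; the hash values are arbitrary):

/* Demonstration of the multiply-shift used above: ((u64)hash * len) >> 32
 * maps a 32-bit hash onto [0, len) without a modulo. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint32_t len = 4;       /* number of queues in this CPU's map */
        uint32_t hashes[] = { 0x00000000, 0x40000000, 0x80000000,
                              0xc0000000, 0xffffffff };

        for (unsigned i = 0; i < sizeof(hashes) / sizeof(hashes[0]); i++) {
                uint32_t idx = ((uint64_t)hashes[i] * len) >> 32;
                printf("hash %08x -> queue slot %u\n", hashes[i], idx);
        }
        return 0;       /* prints slots 0, 1, 2, 3, 3 */
}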
@@ -2161,7 +2203,9 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev,
                         queue_index >= dev->real_num_tx_queues) {
                 int old_index = queue_index;
 
-                queue_index = skb_tx_hash(dev, skb);
+                queue_index = get_xps_queue(dev, skb);
+                if (queue_index < 0)
+                        queue_index = skb_tx_hash(dev, skb);
 
                 if (queue_index != old_index && sk) {
                         struct dst_entry *dst =
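
The dev_pick_tx change above is a simple precedence rule: consult XPS first, and fall back to the flow hash when get_xps_queue returns -1 (no map for this CPU, CONFIG_RPS disabled, or a stale queue index). A minimal sketch of that shape, where xps_lookup and flow_hash are hypothetical stand-ins for get_xps_queue and skb_tx_hash:

/* Sketch (not from the patch) of the queue-selection fallback order. */
#include <stdio.h>

static int xps_lookup(int cpu) { return cpu == 0 ? 3 : -1; }  /* mapped? */
static int flow_hash(void)     { return 1; }                  /* fallback */

static int pick_tx_queue(int cpu)
{
        int queue = xps_lookup(cpu);    /* XPS first */

        if (queue < 0)
                queue = flow_hash();    /* hash fallback */
        return queue;
}

int main(void)
{
        printf("cpu 0 -> queue %d\n", pick_tx_queue(0));  /* XPS hit */
        printf("cpu 1 -> queue %d\n", pick_tx_queue(1));  /* fallback */
        return 0;
}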
@@ -5066,6 +5110,7 @@ static int netif_alloc_netdev_queues(struct net_device *dev)
 {
         unsigned int count = dev->num_tx_queues;
         struct netdev_queue *tx;
+        int i;
 
         BUG_ON(count < 1);
 
@@ -5076,15 +5121,17 @@
                 return -ENOMEM;
         }
         dev->_tx = tx;
+
+        for (i = 0; i < count; i++)
+                tx[i].dev = dev;
+
         return 0;
 }
 
 static void netdev_init_one_queue(struct net_device *dev,
                                   struct netdev_queue *queue,
                                   void *_unused)
 {
-        queue->dev = dev;
-
         /* Initialize queue lock */
         spin_lock_init(&queue->_xmit_lock);
         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
