Skip to content

Commit ec16227

Browse files
Andy Grover authored and davem330 committed
RDS/IB: Infiniband transport
Registers as an RDS transport and an IB client, and uses IB CM API to allocate ids, queue pairs, and the rest of that fun stuff. Signed-off-by: Andy Grover <[email protected]> Signed-off-by: David S. Miller <[email protected]>
1 parent eff5f53 commit ec16227

File tree

3 files changed

+1416
-0
lines changed

3 files changed

+1416
-0
lines changed

net/rds/ib.c

Lines changed: 323 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,323 @@
1+
/*
2+
* Copyright (c) 2006 Oracle. All rights reserved.
3+
*
4+
* This software is available to you under a choice of one of two
5+
* licenses. You may choose to be licensed under the terms of the GNU
6+
* General Public License (GPL) Version 2, available from the file
7+
* COPYING in the main directory of this source tree, or the
8+
* OpenIB.org BSD license below:
9+
*
10+
* Redistribution and use in source and binary forms, with or
11+
* without modification, are permitted provided that the following
12+
* conditions are met:
13+
*
14+
* - Redistributions of source code must retain the above
15+
* copyright notice, this list of conditions and the following
16+
* disclaimer.
17+
*
18+
* - Redistributions in binary form must reproduce the above
19+
* copyright notice, this list of conditions and the following
20+
* disclaimer in the documentation and/or other materials
21+
* provided with the distribution.
22+
*
23+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24+
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25+
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26+
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27+
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28+
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29+
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30+
* SOFTWARE.
31+
*
32+
*/
33+
#include <linux/kernel.h>
34+
#include <linux/in.h>
35+
#include <linux/if.h>
36+
#include <linux/netdevice.h>
37+
#include <linux/inetdevice.h>
38+
#include <linux/if_arp.h>
39+
#include <linux/delay.h>
40+
41+
#include "rds.h"
42+
#include "ib.h"
43+
44+
unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE;
45+
unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */
46+
47+
module_param(fmr_pool_size, int, 0444);
48+
MODULE_PARM_DESC(fmr_pool_size, " Max number of fmr per HCA");
49+
module_param(fmr_message_size, int, 0444);
50+
MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer");
51+
52+
struct list_head rds_ib_devices;
53+
54+
DEFINE_SPINLOCK(ib_nodev_conns_lock);
55+
LIST_HEAD(ib_nodev_conns);
56+
57+
void rds_ib_add_one(struct ib_device *device)
58+
{
59+
struct rds_ib_device *rds_ibdev;
60+
struct ib_device_attr *dev_attr;
61+
62+
/* Only handle IB (no iWARP) devices */
63+
if (device->node_type != RDMA_NODE_IB_CA)
64+
return;
65+
66+
dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL);
67+
if (!dev_attr)
68+
return;
69+
70+
if (ib_query_device(device, dev_attr)) {
71+
rdsdebug("Query device failed for %s\n", device->name);
72+
goto free_attr;
73+
}
74+
75+
rds_ibdev = kmalloc(sizeof *rds_ibdev, GFP_KERNEL);
76+
if (!rds_ibdev)
77+
goto free_attr;
78+
79+
spin_lock_init(&rds_ibdev->spinlock);
80+
81+
rds_ibdev->max_wrs = dev_attr->max_qp_wr;
82+
rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE);
83+
84+
rds_ibdev->fmr_page_shift = max(9, ffs(dev_attr->page_size_cap) - 1);
85+
rds_ibdev->fmr_page_size = 1 << rds_ibdev->fmr_page_shift;
86+
rds_ibdev->fmr_page_mask = ~((u64) rds_ibdev->fmr_page_size - 1);
87+
rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32;
88+
rds_ibdev->max_fmrs = dev_attr->max_fmr ?
89+
min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) :
90+
fmr_pool_size;
91+
92+
rds_ibdev->dev = device;
93+
rds_ibdev->pd = ib_alloc_pd(device);
94+
if (IS_ERR(rds_ibdev->pd))
95+
goto free_dev;
96+
97+
rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd,
98+
IB_ACCESS_LOCAL_WRITE);
99+
if (IS_ERR(rds_ibdev->mr))
100+
goto err_pd;
101+
102+
rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev);
103+
if (IS_ERR(rds_ibdev->mr_pool)) {
104+
rds_ibdev->mr_pool = NULL;
105+
goto err_mr;
106+
}
107+
108+
INIT_LIST_HEAD(&rds_ibdev->ipaddr_list);
109+
INIT_LIST_HEAD(&rds_ibdev->conn_list);
110+
list_add_tail(&rds_ibdev->list, &rds_ib_devices);
111+
112+
ib_set_client_data(device, &rds_ib_client, rds_ibdev);
113+
114+
goto free_attr;
115+
116+
err_mr:
117+
ib_dereg_mr(rds_ibdev->mr);
118+
err_pd:
119+
ib_dealloc_pd(rds_ibdev->pd);
120+
free_dev:
121+
kfree(rds_ibdev);
122+
free_attr:
123+
kfree(dev_attr);
124+
}
125+
126+
void rds_ib_remove_one(struct ib_device *device)
127+
{
128+
struct rds_ib_device *rds_ibdev;
129+
struct rds_ib_ipaddr *i_ipaddr, *i_next;
130+
131+
rds_ibdev = ib_get_client_data(device, &rds_ib_client);
132+
if (!rds_ibdev)
133+
return;
134+
135+
list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) {
136+
list_del(&i_ipaddr->list);
137+
kfree(i_ipaddr);
138+
}
139+
140+
rds_ib_remove_conns(rds_ibdev);
141+
142+
if (rds_ibdev->mr_pool)
143+
rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
144+
145+
ib_dereg_mr(rds_ibdev->mr);
146+
147+
while (ib_dealloc_pd(rds_ibdev->pd)) {
148+
rdsdebug("Failed to dealloc pd %p\n", rds_ibdev->pd);
149+
msleep(1);
150+
}
151+
152+
list_del(&rds_ibdev->list);
153+
kfree(rds_ibdev);
154+
}
155+
156+
struct ib_client rds_ib_client = {
157+
.name = "rds_ib",
158+
.add = rds_ib_add_one,
159+
.remove = rds_ib_remove_one
160+
};
161+
162+
static int rds_ib_conn_info_visitor(struct rds_connection *conn,
163+
void *buffer)
164+
{
165+
struct rds_info_rdma_connection *iinfo = buffer;
166+
struct rds_ib_connection *ic;
167+
168+
/* We will only ever look at IB transports */
169+
if (conn->c_trans != &rds_ib_transport)
170+
return 0;
171+
172+
iinfo->src_addr = conn->c_laddr;
173+
iinfo->dst_addr = conn->c_faddr;
174+
175+
memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid));
176+
memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid));
177+
if (rds_conn_state(conn) == RDS_CONN_UP) {
178+
struct rds_ib_device *rds_ibdev;
179+
struct rdma_dev_addr *dev_addr;
180+
181+
ic = conn->c_transport_data;
182+
dev_addr = &ic->i_cm_id->route.addr.dev_addr;
183+
184+
ib_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
185+
ib_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
186+
187+
rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
188+
iinfo->max_send_wr = ic->i_send_ring.w_nr;
189+
iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
190+
iinfo->max_send_sge = rds_ibdev->max_sge;
191+
rds_ib_get_mr_info(rds_ibdev, iinfo);
192+
}
193+
return 1;
194+
}
195+
196+
static void rds_ib_ic_info(struct socket *sock, unsigned int len,
197+
struct rds_info_iterator *iter,
198+
struct rds_info_lengths *lens)
199+
{
200+
rds_for_each_conn_info(sock, len, iter, lens,
201+
rds_ib_conn_info_visitor,
202+
sizeof(struct rds_info_rdma_connection));
203+
}
204+
205+
206+
/*
207+
* Early RDS/IB was built to only bind to an address if there is an IPoIB
208+
* device with that address set.
209+
*
210+
* If it were me, I'd advocate for something more flexible. Sending and
211+
* receiving should be device-agnostic. Transports would try and maintain
212+
* connections between peers who have messages queued. Userspace would be
213+
* allowed to influence which paths have priority. We could call userspace
214+
* asserting this policy "routing".
215+
*/
216+
static int rds_ib_laddr_check(__be32 addr)
217+
{
218+
int ret;
219+
struct rdma_cm_id *cm_id;
220+
struct sockaddr_in sin;
221+
222+
/* Create a CMA ID and try to bind it. This catches both
223+
* IB and iWARP capable NICs.
224+
*/
225+
cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP);
226+
if (!cm_id)
227+
return -EADDRNOTAVAIL;
228+
229+
memset(&sin, 0, sizeof(sin));
230+
sin.sin_family = AF_INET;
231+
sin.sin_addr.s_addr = addr;
232+
233+
/* rdma_bind_addr will only succeed for IB & iWARP devices */
234+
ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
235+
/* due to this, we will claim to support iWARP devices unless we
236+
check node_type. */
237+
if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA)
238+
ret = -EADDRNOTAVAIL;
239+
240+
rdsdebug("addr %pI4 ret %d node type %d\n",
241+
&addr, ret,
242+
cm_id->device ? cm_id->device->node_type : -1);
243+
244+
rdma_destroy_id(cm_id);
245+
246+
return ret;
247+
}
248+
249+
void rds_ib_exit(void)
250+
{
251+
rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
252+
rds_ib_remove_nodev_conns();
253+
ib_unregister_client(&rds_ib_client);
254+
rds_ib_sysctl_exit();
255+
rds_ib_recv_exit();
256+
rds_trans_unregister(&rds_ib_transport);
257+
}
258+
259+
struct rds_transport rds_ib_transport = {
260+
.laddr_check = rds_ib_laddr_check,
261+
.xmit_complete = rds_ib_xmit_complete,
262+
.xmit = rds_ib_xmit,
263+
.xmit_cong_map = NULL,
264+
.xmit_rdma = rds_ib_xmit_rdma,
265+
.recv = rds_ib_recv,
266+
.conn_alloc = rds_ib_conn_alloc,
267+
.conn_free = rds_ib_conn_free,
268+
.conn_connect = rds_ib_conn_connect,
269+
.conn_shutdown = rds_ib_conn_shutdown,
270+
.inc_copy_to_user = rds_ib_inc_copy_to_user,
271+
.inc_purge = rds_ib_inc_purge,
272+
.inc_free = rds_ib_inc_free,
273+
.cm_initiate_connect = rds_ib_cm_initiate_connect,
274+
.cm_handle_connect = rds_ib_cm_handle_connect,
275+
.cm_connect_complete = rds_ib_cm_connect_complete,
276+
.stats_info_copy = rds_ib_stats_info_copy,
277+
.exit = rds_ib_exit,
278+
.get_mr = rds_ib_get_mr,
279+
.sync_mr = rds_ib_sync_mr,
280+
.free_mr = rds_ib_free_mr,
281+
.flush_mrs = rds_ib_flush_mrs,
282+
.t_owner = THIS_MODULE,
283+
.t_name = "infiniband",
284+
};
285+
286+
/*
 * Bring up the IB transport.
 *
 * Initializes the device list, registers the IB client, sets up sysctl and
 * receive state, registers the transport with RDS and finally installs the
 * RDS_INFO_IB_CONNECTIONS info callback.
 *
 * Returns 0 on success.  If a step fails, the steps already completed are
 * undone in reverse order and that step's error code is returned.
 */
int __init rds_ib_init(void)
{
	int err;

	INIT_LIST_HEAD(&rds_ib_devices);

	err = ib_register_client(&rds_ib_client);
	if (err)
		return err;

	err = rds_ib_sysctl_init();
	if (err)
		goto undo_client;

	err = rds_ib_recv_init();
	if (err)
		goto undo_sysctl;

	err = rds_trans_register(&rds_ib_transport);
	if (err)
		goto undo_recv;

	rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);

	return 0;

undo_recv:
	rds_ib_recv_exit();
undo_sysctl:
	rds_ib_sysctl_exit();
undo_client:
	ib_unregister_client(&rds_ib_client);
	return err;
}
321+
322+
/*
 * License tag for this module.
 * NOTE(review): the file header offers a choice of GPL v2 or the OpenIB.org
 * BSD license, while this tag says plain "GPL" -- confirm that is intended.
 */
MODULE_LICENSE("GPL");
323+

0 commit comments

Comments
 (0)