FD.io VPP  v21.06
Vector Packet Processing
device.c
Go to the documentation of this file.
1 /*
2  *------------------------------------------------------------------
3  * Copyright (c) 2018 Cisco and/or its affiliates.
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at:
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  *------------------------------------------------------------------
16  */
17 
18 #include <unistd.h>
19 #include <fcntl.h>
20 #include <net/if.h>
21 #include <linux/if_link.h>
22 #include <linux/if_ether.h>
23 
24 #include <vppinfra/linux/sysfs.h>
25 #include <vlib/vlib.h>
26 #include <vlib/unix/unix.h>
27 #include <vlib/pci/pci.h>
28 #include <vnet/ethernet/ethernet.h>
30 
31 #include <rdma/rdma.h>
32 
33 /* Default RSS hash key (from DPDK MLX driver) */
34 static u8 rdma_rss_hash_key[] = {
35  0x2c, 0xc6, 0x81, 0xd1,
36  0x5b, 0xdb, 0xf4, 0xf7,
37  0xfc, 0xa2, 0x83, 0x19,
38  0xdb, 0x1a, 0x3e, 0x94,
39  0x6b, 0x9e, 0x38, 0xd9,
40  0x2c, 0x9c, 0x03, 0xd1,
41  0xad, 0x99, 0x44, 0xa7,
42  0xd9, 0x56, 0x3d, 0x59,
43  0x06, 0x3c, 0x25, 0xf3,
44  0xfc, 0x1f, 0xdc, 0x2a,
45 };
46 
48 
/* (dev) is of type (rdma_device_t *) */

/* Log through the rdma plugin's vlib log class, prefixing the message with
 * the device name. */
#define rdma_log__(lvl, dev, f, ...)                                  \
  do                                                                  \
    {                                                                 \
      vlib_log ((lvl), rdma_main.log_class, "%s: " f, (dev)->name,    \
		##__VA_ARGS__);                                       \
    }                                                                 \
  while (0)

/* Same as rdma_log__() but also prepends strerror(errno)/errno — intended
 * for use right after a failing libc/libibverbs call. */
#define rdma_log(lvl, dev, f, ...) \
  rdma_log__((lvl), (dev), "%s (%d): " f, strerror(errno), errno, ##__VA_ARGS__)
60 
61 static struct ibv_flow *
62 rdma_rxq_init_flow (const rdma_device_t * rd, struct ibv_qp *qp,
63  const mac_address_t * mac, const mac_address_t * mask,
64  u16 ether_type, u32 flags)
65 {
66  struct ibv_flow *flow;
67  struct raw_eth_flow_attr
68  {
69  struct ibv_flow_attr attr;
70  struct ibv_flow_spec_eth spec_eth;
71  } __attribute__ ((packed)) fa;
72 
73  memset (&fa, 0, sizeof (fa));
74  fa.attr.num_of_specs = 1;
75  fa.attr.port = 1;
76  fa.attr.flags = flags;
77  fa.spec_eth.type = IBV_FLOW_SPEC_ETH;
78  fa.spec_eth.size = sizeof (struct ibv_flow_spec_eth);
79 
80  memcpy (fa.spec_eth.val.dst_mac, mac, sizeof (fa.spec_eth.val.dst_mac));
81  memcpy (fa.spec_eth.mask.dst_mac, mask, sizeof (fa.spec_eth.mask.dst_mac));
82 
83  if (ether_type)
84  {
85  fa.spec_eth.val.ether_type = ether_type;
86  fa.spec_eth.mask.ether_type = 0xffff;
87  }
88 
89  flow = ibv_create_flow (qp, &fa.attr);
90  if (!flow)
91  rdma_log (VLIB_LOG_LEVEL_ERR, rd, "ibv_create_flow() failed");
92  return flow;
93 }
94 
95 static u32
96 rdma_rxq_destroy_flow (const rdma_device_t * rd, struct ibv_flow **flow)
97 {
98  if (!*flow)
99  return 0;
100 
101  if (ibv_destroy_flow (*flow))
102  {
103  rdma_log (VLIB_LOG_LEVEL_ERR, rd, "ibv_destroy_flow() failed");
104  return ~0;
105  }
106 
107  *flow = 0;
108  return 0;
109 }
110 
111 static u32
113 {
114  const mac_address_t all = {.bytes = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0} };
115  int err;
116 
117  err = rdma_rxq_destroy_flow (rd, &rd->flow_mcast6);
118  err |= rdma_rxq_destroy_flow (rd, &rd->flow_ucast6);
119  err |= rdma_rxq_destroy_flow (rd, &rd->flow_mcast4);
120  err |= rdma_rxq_destroy_flow (rd, &rd->flow_ucast4);
121  if (err)
122  return ~0;
123 
124  rd->flow_ucast6 =
125  rdma_rxq_init_flow (rd, rd->rx_qp6, &all, &all, ntohs (ETH_P_IPV6), 0);
126  rd->flow_ucast4 = rdma_rxq_init_flow (rd, rd->rx_qp4, &all, &all, 0, 0);
127  if (!rd->flow_ucast6 || !rd->flow_ucast4)
128  return ~0;
129 
130  rd->flags |= RDMA_DEVICE_F_PROMISC;
131  return 0;
132 }
133 
134 static u32
136 {
137  const mac_address_t ucast = {.bytes = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}
138  };
139  const mac_address_t mcast = {.bytes = {0x1, 0x0, 0x0, 0x0, 0x0, 0x0} };
140  int err;
141 
142  err = rdma_rxq_destroy_flow (rd, &rd->flow_mcast6);
143  err |= rdma_rxq_destroy_flow (rd, &rd->flow_ucast6);
144  err |= rdma_rxq_destroy_flow (rd, &rd->flow_mcast4);
145  err |= rdma_rxq_destroy_flow (rd, &rd->flow_ucast4);
146  if (err)
147  return ~0;
148 
149  rd->flow_ucast6 =
150  rdma_rxq_init_flow (rd, rd->rx_qp6, &rd->hwaddr, &ucast,
151  ntohs (ETH_P_IPV6), 0);
152  rd->flow_mcast6 =
153  rdma_rxq_init_flow (rd, rd->rx_qp6, &mcast, &mcast, ntohs (ETH_P_IPV6),
154  IBV_FLOW_ATTR_FLAGS_DONT_TRAP
155  /* let others receive mcast packet too (eg. Linux) */
156  );
157  rd->flow_ucast4 =
158  rdma_rxq_init_flow (rd, rd->rx_qp4, &rd->hwaddr, &ucast, 0, 0);
159  rd->flow_mcast4 =
160  rdma_rxq_init_flow (rd, rd->rx_qp4, &mcast, &mcast, 0,
161  IBV_FLOW_ATTR_FLAGS_DONT_TRAP
162  /* let others receive mcast packet too (eg. Linux) */
163  );
164  if (!rd->flow_ucast6 || !rd->flow_mcast6 || !rd->flow_ucast4
165  || !rd->flow_mcast4)
166  return ~0;
167 
168  rd->flags &= ~RDMA_DEVICE_F_PROMISC;
169  return 0;
170 }
171 
172 static clib_error_t *
173 rdma_mac_change (vnet_hw_interface_t * hw, const u8 * old, const u8 * new)
174 {
175  rdma_main_t *rm = &rdma_main;
177  mac_address_from_bytes (&rd->hwaddr, new);
178  if (!(rd->flags & RDMA_DEVICE_F_PROMISC) && rdma_dev_set_ucast (rd))
179  {
180  mac_address_from_bytes (&rd->hwaddr, old);
181  return clib_error_return_unix (0, "MAC update failed");
182  }
183  return 0;
184 }
185 
186 static u32
188 {
189  rdma_log__ (VLIB_LOG_LEVEL_ERR, rd, "MTU change not supported");
190  return ~0;
191 }
192 
193 static u32
195 {
196  rdma_main_t *rm = &rdma_main;
198 
199  switch (flags)
200  {
202  return rdma_dev_set_ucast (rd);
204  return rdma_dev_set_promisc (rd);
206  return rdma_dev_change_mtu (rd);
207  }
208 
209  rdma_log__ (VLIB_LOG_LEVEL_ERR, rd, "unknown flag %x requested", flags);
210  return ~0;
211 }
212 
213 static void
215 {
216  struct ibv_port_attr attr;
217  u32 width = 0;
218  u32 speed = 0;
219 
220  if (ibv_query_port (rd->ctx, port, &attr))
221  {
224  return;
225  }
226 
227  /* update state */
228  switch (attr.state)
229  {
230  case IBV_PORT_ACTIVE: /* fallthrough */
231  case IBV_PORT_ACTIVE_DEFER:
232  rd->flags |= RDMA_DEVICE_F_LINK_UP;
235  break;
236  default:
237  rd->flags &= ~RDMA_DEVICE_F_LINK_UP;
239  break;
240  }
241 
242  /* update speed */
243  switch (attr.active_width)
244  {
245  case 1:
246  width = 1;
247  break;
248  case 2:
249  width = 4;
250  break;
251  case 4:
252  width = 8;
253  break;
254  case 8:
255  width = 12;
256  break;
257  }
258  switch (attr.active_speed)
259  {
260  case 1:
261  speed = 2500000;
262  break;
263  case 2:
264  speed = 5000000;
265  break;
266  case 4: /* fallthrough */
267  case 8:
268  speed = 10000000;
269  break;
270  case 16:
271  speed = 14000000;
272  break;
273  case 32:
274  speed = 25000000;
275  break;
276  }
277  vnet_hw_interface_set_link_speed (vnm, rd->hw_if_index, width * speed);
278 }
279 
280 static clib_error_t *
282 {
283  rdma_main_t *rm = &rdma_main;
285  return clib_error_return (0, "RDMA: %s: async event error", rd->name);
286 }
287 
288 static clib_error_t *
290 {
291  vnet_main_t *vnm = vnet_get_main ();
292  rdma_main_t *rm = &rdma_main;
294  int ret;
295  struct ibv_async_event event;
296  ret = ibv_get_async_event (rd->ctx, &event);
297  if (ret < 0)
298  return clib_error_return_unix (0, "ibv_get_async_event() failed");
299 
300  switch (event.event_type)
301  {
302  case IBV_EVENT_PORT_ACTIVE:
303  rdma_update_state (vnm, rd, event.element.port_num);
304  break;
305  case IBV_EVENT_PORT_ERR:
306  rdma_update_state (vnm, rd, event.element.port_num);
307  break;
308  case IBV_EVENT_DEVICE_FATAL:
309  rd->flags &= ~RDMA_DEVICE_F_LINK_UP;
311  vlib_log_emerg (rm->log_class, "%s: fatal error", rd->name);
312  break;
313  default:
314  rdma_log__ (VLIB_LOG_LEVEL_ERR, rd, "unhandeld RDMA async event %d",
315  event.event_type);
316  break;
317  }
318 
319  ibv_ack_async_event (&event);
320  return 0;
321 }
322 
323 static clib_error_t *
325 {
326  clib_file_t t = { 0 };
327  int ret;
328 
329  /* make RDMA async event fd non-blocking */
330  ret = fcntl (rd->ctx->async_fd, F_GETFL);
331  if (ret < 0)
332  return clib_error_return_unix (0, "fcntl(F_GETFL) failed");
333 
334  ret = fcntl (rd->ctx->async_fd, F_SETFL, ret | O_NONBLOCK);
335  if (ret < 0)
336  return clib_error_return_unix (0, "fcntl(F_SETFL, O_NONBLOCK) failed");
337 
338  /* register RDMA async event fd */
340  t.file_descriptor = rd->ctx->async_fd;
342  t.private_data = rd->dev_instance;
343  t.description = format (0, "%v async event", rd->name);
344 
346  return 0;
347 }
348 
349 static void
351 {
353 }
354 
355 static clib_error_t *
357 {
358  clib_error_t *err =
360  rd->dev_instance, rd->hwaddr.bytes,
362 
363  /* Indicate ability to support L3 DMAC filtering and
364  * initialize interface to L3 non-promisc mode */
369  return err;
370 }
371 
372 static void
374 {
377 }
378 
379 static void
381 {
382  rdma_main_t *rm = &rdma_main;
383  rdma_rxq_t *rxq;
384  rdma_txq_t *txq;
385 
386 #define _(fn, arg) if (arg) \
387  { \
388  int rv; \
389  if ((rv = fn (arg))) \
390  rdma_log (VLIB_LOG_LEVEL_DEBUG, rd, #fn "() failed (rv = %d)", rv); \
391  }
392 
393  _(ibv_destroy_flow, rd->flow_mcast6);
394  _(ibv_destroy_flow, rd->flow_ucast6);
395  _(ibv_destroy_flow, rd->flow_mcast4);
396  _(ibv_destroy_flow, rd->flow_ucast4);
397  _(ibv_dereg_mr, rd->mr);
398  vec_foreach (txq, rd->txqs)
399  {
400  _(ibv_destroy_qp, txq->qp);
401  _(ibv_destroy_cq, txq->cq);
402  }
403  vec_foreach (rxq, rd->rxqs)
404  {
405  _(ibv_destroy_wq, rxq->wq);
406  _(ibv_destroy_cq, rxq->cq);
407  }
408  _(ibv_destroy_rwq_ind_table, rd->rx_rwq_ind_tbl);
409  _(ibv_destroy_qp, rd->rx_qp6);
410  _(ibv_destroy_qp, rd->rx_qp4);
411  _(ibv_dealloc_pd, rd->pd);
412  _(ibv_close_device, rd->ctx);
413 #undef _
414 
415  clib_error_free (rd->error);
416 
417  vec_free (rd->rxqs);
418  vec_free (rd->txqs);
419  vec_free (rd->name);
421  pool_put (rm->devices, rd);
422 }
423 
424 static clib_error_t *
426  u8 no_multi_seg, u16 max_pktlen)
427 {
428  rdma_rxq_t *rxq;
429  struct ibv_wq_init_attr wqia;
430  struct ibv_cq_init_attr_ex cqa = { };
431  struct ibv_wq_attr wqa;
432  struct ibv_cq_ex *cqex;
433  struct mlx5dv_wq_init_attr dv_wqia = { };
434  int is_mlx5dv = ! !(rd->flags & RDMA_DEVICE_F_MLX5DV);
435  int is_striding = ! !(rd->flags & RDMA_DEVICE_F_STRIDING_RQ);
436 
438  rxq = vec_elt_at_index (rd->rxqs, qid);
439  rxq->size = n_desc;
440  rxq->log_wqe_sz = 0;
442  vec_validate_aligned (rxq->bufs, n_desc - 1, CLIB_CACHE_LINE_BYTES);
443 
444  cqa.cqe = n_desc;
445  if (is_mlx5dv)
446  {
447  struct mlx5dv_cq_init_attr dvcq = { };
448  dvcq.comp_mask = MLX5DV_CQ_INIT_ATTR_MASK_COMPRESSED_CQE;
449  dvcq.cqe_comp_res_format = MLX5DV_CQE_RES_FORMAT_HASH;
450 
451  if ((cqex = mlx5dv_create_cq (rd->ctx, &cqa, &dvcq)) == 0)
452  return clib_error_return_unix (0, "Create mlx5dv rx CQ Failed");
453  }
454  else
455  {
456  if ((cqex = ibv_create_cq_ex (rd->ctx, &cqa)) == 0)
457  return clib_error_return_unix (0, "Create CQ Failed");
458  }
459 
460  rxq->cq = ibv_cq_ex_to_cq (cqex);
461 
462  memset (&wqia, 0, sizeof (wqia));
463  wqia.wq_type = IBV_WQT_RQ;
464  wqia.max_wr = n_desc;
465  wqia.max_sge = 1;
466  wqia.pd = rd->pd;
467  wqia.cq = rxq->cq;
468  if (is_mlx5dv)
469  {
470  if (is_striding)
471  {
472  /* In STRIDING_RQ mode, map a descriptor to a stride, not a full WQE buffer */
473  uword data_seg_log2_sz =
475  rxq->buf_sz = 1 << data_seg_log2_sz;
476  /* The trick is also to map a descriptor to a data segment in the WQE SG list
477  The number of strides per WQE and the size of a WQE (in 16-bytes words) both
478  must be powers of two.
479  Moreover, in striding RQ mode, WQEs must include the SRQ header, which occupies
480  one 16-bytes word. That is why WQEs have 2*RDMA_RXQ_MAX_CHAIN_SZ 16-bytes words:
481  - One for the SRQ Header
482  - RDMA_RXQ_MAX_CHAIN_SZ for the different data segments (each mapped to
483  a stride, and a vlib_buffer)
484  - RDMA_RXQ_MAX_CHAIN_SZ-1 null data segments
485  */
486  int max_chain_log_sz =
487  max_pktlen ? max_log2 ((max_pktlen /
488  (rxq->buf_sz)) +
490  max_chain_log_sz = clib_max (max_chain_log_sz, 3);
491  wqia.max_sge = 1 << max_chain_log_sz;
492  dv_wqia.comp_mask = MLX5DV_WQ_INIT_ATTR_MASK_STRIDING_RQ;
493  dv_wqia.striding_rq_attrs.two_byte_shift_en = 0;
494  dv_wqia.striding_rq_attrs.single_wqe_log_num_of_strides =
495  max_chain_log_sz;
496  dv_wqia.striding_rq_attrs.single_stride_log_num_of_bytes =
497  data_seg_log2_sz;
498  wqia.max_wr >>= max_chain_log_sz;
499  rxq->log_wqe_sz = max_chain_log_sz + 1;
500  rxq->log_stride_per_wqe = max_chain_log_sz;
501  }
502  else
503  {
504  /* In non STRIDING_RQ mode and if multiseg is not disabled, each WQE is a SG list of data
505  segments, each pointing to a vlib_buffer. */
506  if (no_multi_seg)
507  {
508  wqia.max_sge = 1;
509  rxq->log_wqe_sz = 0;
510  rxq->n_ds_per_wqe = 1;
511  }
512  else
513  {
514  int max_chain_sz =
515  max_pktlen ? (max_pktlen /
516  (rxq->buf_sz)) +
518  int max_chain_log_sz = max_log2 (max_chain_sz);
519  wqia.max_sge = 1 << max_chain_log_sz;
520  rxq->log_wqe_sz = max_chain_log_sz;
521  rxq->n_ds_per_wqe = max_chain_sz;
522  }
523 
524  }
525 
526  if ((rxq->wq = mlx5dv_create_wq (rd->ctx, &wqia, &dv_wqia)))
527  {
528  rxq->wq->events_completed = 0;
529  pthread_mutex_init (&rxq->wq->mutex, NULL);
530  pthread_cond_init (&rxq->wq->cond, NULL);
531  }
532  else
533  return clib_error_return_unix (0, "Create WQ Failed");
534  }
535  else if ((rxq->wq = ibv_create_wq (rd->ctx, &wqia)) == 0)
536  return clib_error_return_unix (0, "Create WQ Failed");
537 
538  memset (&wqa, 0, sizeof (wqa));
539  wqa.attr_mask = IBV_WQ_ATTR_STATE;
540  wqa.wq_state = IBV_WQS_RDY;
541  if (ibv_modify_wq (rxq->wq, &wqa) != 0)
542  return clib_error_return_unix (0, "Modify WQ (RDY) Failed");
543 
544  if (is_mlx5dv)
545  {
546  struct mlx5dv_obj obj = { };
547  struct mlx5dv_cq dv_cq;
548  struct mlx5dv_rwq dv_rwq;
549  u64 qw0;
550  u64 qw0_nullseg;
551  u32 wqe_sz_mask = (1 << rxq->log_wqe_sz) - 1;
552 
553  obj.cq.in = rxq->cq;
554  obj.cq.out = &dv_cq;
555  obj.rwq.in = rxq->wq;
556  obj.rwq.out = &dv_rwq;
557 
558  if ((mlx5dv_init_obj (&obj, MLX5DV_OBJ_CQ | MLX5DV_OBJ_RWQ)))
559  return clib_error_return_unix (0, "mlx5dv: failed to init rx obj");
560 
561  if (dv_cq.cqe_size != sizeof (mlx5dv_cqe_t))
562  return clib_error_return_unix (0, "mlx5dv: incompatible rx CQE size");
563 
564  rxq->log2_cq_size = max_log2 (dv_cq.cqe_cnt);
565  rxq->cqes = (mlx5dv_cqe_t *) dv_cq.buf;
566  rxq->cq_db = (volatile u32 *) dv_cq.dbrec;
567  rxq->cqn = dv_cq.cqn;
568 
569  rxq->wqes = (mlx5dv_wqe_ds_t *) dv_rwq.buf;
570  rxq->wq_db = (volatile u32 *) dv_rwq.dbrec;
571  rxq->wq_stride = dv_rwq.stride;
572  rxq->wqe_cnt = dv_rwq.wqe_cnt;
573 
574  qw0 = clib_host_to_net_u32 (rxq->buf_sz);
575  qw0_nullseg = 0;
576  qw0 |= (u64) clib_host_to_net_u32 (rd->lkey) << 32;
577  qw0_nullseg |= (u64) clib_host_to_net_u32 (rd->lkey) << 32;
578 
579 /* Prefill the different 16 bytes words of the WQ.
580  - If not in striding RQ mode, for each WQE, init with qw0 the first
581  RDMA_RXQ_LEGACY_MODE_MAX_CHAIN_SZ, and init the rest of the WQE
582  with null segments.
583  - If in striding RQ mode, for each WQE, the RDMA_RXQ_MAX_CHAIN_SZ + 1
584  first 16-bytes words are initialised with qw0, the rest are null segments */
585 
586  for (int i = 0; i < rxq->wqe_cnt << rxq->log_wqe_sz; i++)
587  if ((!is_striding
588  && ((i & wqe_sz_mask) < rxq->n_ds_per_wqe))
589  || (is_striding
590  && ((i == 0)
591  || !(((i - 1) >> rxq->log_stride_per_wqe) & 0x1))))
592  rxq->wqes[i].dsz_and_lkey = qw0;
593  else
594  rxq->wqes[i].dsz_and_lkey = qw0_nullseg;
595 
596  for (int i = 0; i < (1 << rxq->log2_cq_size); i++)
597  rxq->cqes[i].opcode_cqefmt_se_owner = 0xff;
598 
599  if (!is_striding)
600  {
601  vec_validate_aligned (rxq->second_bufs, n_desc - 1,
603  vec_validate_aligned (rxq->n_used_per_chain, n_desc - 1,
605  rxq->n_total_additional_segs = n_desc * (rxq->n_ds_per_wqe - 1);
606  for (int i = 0; i < n_desc; i++)
607  rxq->n_used_per_chain[i] = rxq->n_ds_per_wqe - 1;
608  }
609  }
610 
611  return 0;
612 }
613 
614 static uint64_t
616 {
617  switch (rss4)
618  {
619  case RDMA_RSS4_IP:
620  return IBV_RX_HASH_SRC_IPV4 | IBV_RX_HASH_DST_IPV4;
621  case RDMA_RSS4_IP_UDP:
622  return IBV_RX_HASH_SRC_IPV4 | IBV_RX_HASH_DST_IPV4 |
623  IBV_RX_HASH_SRC_PORT_UDP | IBV_RX_HASH_DST_PORT_UDP;
624  case RDMA_RSS4_AUTO: /* fallthrough */
625  case RDMA_RSS4_IP_TCP:
626  return IBV_RX_HASH_SRC_IPV4 | IBV_RX_HASH_DST_IPV4 |
627  IBV_RX_HASH_SRC_PORT_TCP | IBV_RX_HASH_DST_PORT_TCP;
628  }
629  ASSERT (0);
630  return 0;
631 }
632 
633 static uint64_t
635 {
636  switch (rss6)
637  {
638  case RDMA_RSS6_IP:
639  return IBV_RX_HASH_SRC_IPV6 | IBV_RX_HASH_DST_IPV6;
640  case RDMA_RSS6_IP_UDP:
641  return IBV_RX_HASH_SRC_IPV6 | IBV_RX_HASH_DST_IPV6 |
642  IBV_RX_HASH_SRC_PORT_UDP | IBV_RX_HASH_DST_PORT_UDP;
643  case RDMA_RSS6_AUTO: /* fallthrough */
644  case RDMA_RSS6_IP_TCP:
645  return IBV_RX_HASH_SRC_IPV6 | IBV_RX_HASH_DST_IPV6 |
646  IBV_RX_HASH_SRC_PORT_TCP | IBV_RX_HASH_DST_PORT_TCP;
647  }
648  ASSERT (0);
649  return 0;
650 }
651 
652 static clib_error_t *
654 {
655  struct ibv_rwq_ind_table_init_attr rwqia;
656  struct ibv_qp_init_attr_ex qpia;
657  struct ibv_wq **ind_tbl;
658  const u32 rxq_sz = vec_len (rd->rxqs);
659  u32 ind_tbl_sz = rxq_sz;
660  u32 i;
661 
662  if (!is_pow2 (ind_tbl_sz))
663  {
664  /* in case we do not have a power-of-2 number of rxq, we try to use the
665  * maximum supported to minimize the imbalance */
666  struct ibv_device_attr_ex attr;
667  if (ibv_query_device_ex (rd->ctx, 0, &attr))
668  return clib_error_return_unix (0, "device query failed");
669  ind_tbl_sz = attr.rss_caps.max_rwq_indirection_table_size;
670  if (ind_tbl_sz < rxq_sz)
671  return clib_error_create ("too many rxqs requested (%d) compared to "
672  "max indirection table size (%d)",
673  rxq_sz, ind_tbl_sz);
674  }
675 
676  ind_tbl = vec_new (struct ibv_wq *, ind_tbl_sz);
677  vec_foreach_index (i, ind_tbl)
678  vec_elt (ind_tbl, i) = vec_elt (rd->rxqs, i % rxq_sz).wq;
679  memset (&rwqia, 0, sizeof (rwqia));
680  ASSERT (is_pow2 (vec_len (ind_tbl)));
681  rwqia.log_ind_tbl_size = min_log2 (vec_len (ind_tbl));
682  rwqia.ind_tbl = ind_tbl;
683  if ((rd->rx_rwq_ind_tbl = ibv_create_rwq_ind_table (rd->ctx, &rwqia)) == 0)
684  return clib_error_return_unix (0, "RWQ indirection table create failed");
685  vec_free (ind_tbl);
686 
687  memset (&qpia, 0, sizeof (qpia));
688  qpia.qp_type = IBV_QPT_RAW_PACKET;
689  qpia.comp_mask =
690  IBV_QP_INIT_ATTR_PD | IBV_QP_INIT_ATTR_IND_TABLE |
691  IBV_QP_INIT_ATTR_RX_HASH;
692  qpia.pd = rd->pd;
693  qpia.rwq_ind_tbl = rd->rx_rwq_ind_tbl;
695  qpia.rx_hash_conf.rx_hash_key_len = sizeof (rdma_rss_hash_key);
696  qpia.rx_hash_conf.rx_hash_key = rdma_rss_hash_key;
697  qpia.rx_hash_conf.rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ;
698 
699  qpia.rx_hash_conf.rx_hash_fields_mask = rdma_rss42ibv (rd->rss4);
700  if ((rd->rx_qp4 = ibv_create_qp_ex (rd->ctx, &qpia)) == 0)
701  return clib_error_return_unix (0, "IPv4 Queue Pair create failed");
702 
703  qpia.rx_hash_conf.rx_hash_fields_mask = rdma_rss62ibv (rd->rss6);
704  if ((rd->rx_qp6 = ibv_create_qp_ex (rd->ctx, &qpia)) == 0)
705  return clib_error_return_unix (0, "IPv6 Queue Pair create failed");
706 
707  if (rdma_dev_set_ucast (rd))
708  return clib_error_return_unix (0, "Set unicast mode failed");
709 
710  return 0;
711 }
712 
713 static clib_error_t *
715 {
716  rdma_txq_t *txq;
717  struct ibv_qp_init_attr qpia;
718  struct ibv_qp_attr qpa;
719  int qp_flags;
720 
722  txq = vec_elt_at_index (rd->txqs, qid);
723  ASSERT (is_pow2 (n_desc));
724  txq->bufs_log2sz = min_log2 (n_desc);
725  vec_validate_aligned (txq->bufs, n_desc - 1, CLIB_CACHE_LINE_BYTES);
726 
727  if ((txq->cq = ibv_create_cq (rd->ctx, n_desc, NULL, NULL, 0)) == 0)
728  return clib_error_return_unix (0, "Create CQ Failed");
729 
730  memset (&qpia, 0, sizeof (qpia));
731  qpia.send_cq = txq->cq;
732  qpia.recv_cq = txq->cq;
733  qpia.cap.max_send_wr = n_desc;
734  qpia.cap.max_send_sge = 1;
735  qpia.qp_type = IBV_QPT_RAW_PACKET;
736 
737  if ((txq->qp = ibv_create_qp (rd->pd, &qpia)) == 0)
738  return clib_error_return_unix (0, "Queue Pair create failed");
739 
740  memset (&qpa, 0, sizeof (qpa));
741  qp_flags = IBV_QP_STATE | IBV_QP_PORT;
742  qpa.qp_state = IBV_QPS_INIT;
743  qpa.port_num = 1;
744  if (ibv_modify_qp (txq->qp, &qpa, qp_flags) != 0)
745  return clib_error_return_unix (0, "Modify QP (init) Failed");
746 
747  memset (&qpa, 0, sizeof (qpa));
748  qp_flags = IBV_QP_STATE;
749  qpa.qp_state = IBV_QPS_RTR;
750  if (ibv_modify_qp (txq->qp, &qpa, qp_flags) != 0)
751  return clib_error_return_unix (0, "Modify QP (receive) Failed");
752 
753  memset (&qpa, 0, sizeof (qpa));
754  qp_flags = IBV_QP_STATE;
755  qpa.qp_state = IBV_QPS_RTS;
756  if (ibv_modify_qp (txq->qp, &qpa, qp_flags) != 0)
757  return clib_error_return_unix (0, "Modify QP (send) Failed");
758 
759  txq->ibv_cq = txq->cq;
760  txq->ibv_qp = txq->qp;
761 
762  if (rd->flags & RDMA_DEVICE_F_MLX5DV)
763  {
764  rdma_mlx5_wqe_t *tmpl = (void *) txq->dv_wqe_tmpl;
765  struct mlx5dv_cq dv_cq;
766  struct mlx5dv_qp dv_qp;
767  struct mlx5dv_obj obj = { };
768 
769  obj.cq.in = txq->cq;
770  obj.cq.out = &dv_cq;
771  obj.qp.in = txq->qp;
772  obj.qp.out = &dv_qp;
773 
774  if (mlx5dv_init_obj (&obj, MLX5DV_OBJ_CQ | MLX5DV_OBJ_QP))
775  return clib_error_return_unix (0, "DV init obj failed");
776 
777  if (RDMA_TXQ_BUF_SZ (txq) > dv_qp.sq.wqe_cnt
778  || !is_pow2 (dv_qp.sq.wqe_cnt)
779  || sizeof (rdma_mlx5_wqe_t) != dv_qp.sq.stride
780  || (uword) dv_qp.sq.buf % sizeof (rdma_mlx5_wqe_t))
781  return clib_error_return (0, "Unsupported DV SQ parameters");
782 
783  if (RDMA_TXQ_BUF_SZ (txq) > dv_cq.cqe_cnt
784  || !is_pow2 (dv_cq.cqe_cnt)
785  || sizeof (struct mlx5_cqe64) != dv_cq.cqe_size
786  || (uword) dv_cq.buf % sizeof (struct mlx5_cqe64))
787  return clib_error_return (0, "Unsupported DV CQ parameters");
788 
789  /* get SQ and doorbell addresses */
790  txq->dv_sq_wqes = dv_qp.sq.buf;
791  txq->dv_sq_dbrec = dv_qp.dbrec;
792  txq->dv_sq_db = dv_qp.bf.reg;
793  txq->dv_sq_log2sz = min_log2 (dv_qp.sq.wqe_cnt);
794 
795  /* get CQ and doorbell addresses */
796  txq->dv_cq_cqes = dv_cq.buf;
797  txq->dv_cq_dbrec = dv_cq.dbrec;
798  txq->dv_cq_log2sz = min_log2 (dv_cq.cqe_cnt);
799 
800  /* init tx desc template */
801  STATIC_ASSERT_SIZEOF (txq->dv_wqe_tmpl, sizeof (*tmpl));
802  mlx5dv_set_ctrl_seg (&tmpl->ctrl, 0, MLX5_OPCODE_SEND, 0,
803  txq->qp->qp_num, 0, RDMA_MLX5_WQE_DS, 0,
805  tmpl->eseg.inline_hdr_sz = htobe16 (MLX5_ETH_L2_INLINE_HEADER_SIZE);
806  mlx5dv_set_data_seg (&tmpl->dseg, 0, rd->lkey, 0);
807  }
808 
809  return 0;
810 }
811 
812 static clib_error_t *
814  rdma_create_if_args_t * args)
815 {
816  clib_error_t *err;
819  u32 rxq_num = args->rxq_num;
820  u32 rxq_size = args->rxq_size;
821  u32 txq_size = args->txq_size;
822  u32 i;
823 
824  if (rd->ctx == 0)
825  return clib_error_return_unix (0, "Device Open Failed");
826 
827  if ((rd->pd = ibv_alloc_pd (rd->ctx)) == 0)
828  return clib_error_return_unix (0, "PD Alloc Failed");
829 
830  if ((rd->mr = ibv_reg_mr (rd->pd, (void *) bm->buffer_mem_start,
831  bm->buffer_mem_size,
832  IBV_ACCESS_LOCAL_WRITE)) == 0)
833  return clib_error_return_unix (0, "Register MR Failed");
834 
835  rd->lkey = rd->mr->lkey; /* avoid indirection in datapath */
836 
838 
839  rd->rss4 = args->rss4;
840  rd->rss6 = args->rss6;
841 
842  /*
843  * /!\ WARNING /!\ creation order is important
844  * We *must* create TX queues *before* RX queues, otherwise we will receive
 845  * the broadcast packets we sent
846  */
847  for (i = 0; i < tm->n_vlib_mains; i++)
848  if ((err = rdma_txq_init (vm, rd, i, txq_size)))
849  return err;
850 
851  for (i = 0; i < rxq_num; i++)
852  if ((err =
853  rdma_rxq_init (vm, rd, i, rxq_size,
854  args->no_multi_seg, args->max_pktlen)))
855  return err;
856  if ((err = rdma_rxq_finalize (vm, rd)))
857  return err;
858 
859  return 0;
860 }
861 
862 static uword
863 sysfs_path_to_pci_addr (char *path, vlib_pci_addr_t * addr)
864 {
865  uword rv;
866  unformat_input_t in;
867  u8 *s;
868 
869  s = clib_sysfs_link_to_name (path);
870  if (!s)
871  return 0;
872 
873  unformat_init_string (&in, (char *) s, strlen ((char *) s));
874  rv = unformat (&in, "%U", unformat_vlib_pci_addr, addr);
875  unformat_free (&in);
876  vec_free (s);
877  return rv;
878 }
879 
880 void
882 {
883  vnet_main_t *vnm = vnet_get_main ();
884  rdma_main_t *rm = &rdma_main;
885  rdma_device_t *rd;
886  vlib_pci_addr_t pci_addr;
887  struct ibv_device **dev_list;
888  int n_devs;
889  u8 *s;
890  u16 qid;
891  int i;
892 
893  args->rxq_size = args->rxq_size ? args->rxq_size : 1024;
894  args->txq_size = args->txq_size ? args->txq_size : 1024;
895  args->rxq_num = args->rxq_num ? args->rxq_num : 2;
896 
897  if (args->rxq_size < VLIB_FRAME_SIZE || args->txq_size < VLIB_FRAME_SIZE ||
898  args->rxq_size > 65535 || args->txq_size > 65535 ||
899  !is_pow2 (args->rxq_size) || !is_pow2 (args->txq_size))
900  {
901  args->rv = VNET_API_ERROR_INVALID_VALUE;
902  args->error = clib_error_return (0,
903  "queue size must be a power of two "
904  "between %d and 65535",
906  goto err0;
907  }
908 
909  dev_list = ibv_get_device_list (&n_devs);
910  if (n_devs == 0)
911  {
912  args->error =
914  "no RDMA devices available. Is the ib_uverbs module loaded?");
915  goto err0;
916  }
917 
918  /* get PCI address */
919  s = format (0, "/sys/class/net/%s/device%c", args->ifname, 0);
920  if (sysfs_path_to_pci_addr ((char *) s, &pci_addr) == 0)
921  {
922  args->error =
923  clib_error_return (0, "cannot find PCI address for device ");
924  goto err1;
925  }
926 
927  pool_get_zero (rm->devices, rd);
928  rd->dev_instance = rd - rm->devices;
930  rd->linux_ifname = format (0, "%s", args->ifname);
931 
932  if (!args->name || 0 == args->name[0])
933  rd->name = format (0, "%s/%d", args->ifname, rd->dev_instance);
934  else
935  rd->name = format (0, "%s", args->name);
936 
937  rd->pci = vlib_pci_get_device_info (vm, &pci_addr, &args->error);
938  if (!rd->pci)
939  goto err2;
940 
941  /* if we failed to parse NUMA node, default to 0 */
942  if (-1 == rd->pci->numa_node)
943  rd->pci->numa_node = 0;
944 
946 
947  if (strncmp ((char *) rd->pci->driver_name, "mlx5_core", 9))
948  {
949  args->error =
951  "invalid interface (only mlx5 supported for now)");
952  goto err2;
953  }
954 
955  for (i = 0; i < n_devs; i++)
956  {
957  vlib_pci_addr_t addr;
958 
959  vec_reset_length (s);
960  s = format (s, "%s/device%c", dev_list[i]->dev_path, 0);
961 
962  if (sysfs_path_to_pci_addr ((char *) s, &addr) == 0)
963  continue;
964 
965  if (addr.as_u32 != rd->pci->addr.as_u32)
966  continue;
967 
968  if ((rd->ctx = ibv_open_device (dev_list[i])))
969  break;
970  }
971 
972  if (args->mode != RDMA_MODE_IBV)
973  {
974  struct mlx5dv_context mlx5dv_attrs = { };
975  mlx5dv_attrs.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ;
976 
977  if (mlx5dv_query_device (rd->ctx, &mlx5dv_attrs) == 0)
978  {
979  uword data_seg_log2_sz =
981 
982  if ((mlx5dv_attrs.flags & MLX5DV_CONTEXT_FLAGS_CQE_V1))
983  rd->flags |= RDMA_DEVICE_F_MLX5DV;
984 
985 /* Enable striding RQ if neither multiseg nor striding rq
986 are explicitly disabled, and if the interface supports it.*/
987  if (!args->no_multi_seg && !args->disable_striding_rq
988  && data_seg_log2_sz <=
989  mlx5dv_attrs.striding_rq_caps.max_single_stride_log_num_of_bytes
990  && data_seg_log2_sz >=
991  mlx5dv_attrs.striding_rq_caps.min_single_stride_log_num_of_bytes
993  mlx5dv_attrs.striding_rq_caps.min_single_wqe_log_num_of_strides
995  mlx5dv_attrs.striding_rq_caps.max_single_wqe_log_num_of_strides)
996  rd->flags |= RDMA_DEVICE_F_STRIDING_RQ;
997  }
998  else
999  {
1000  if (args->mode == RDMA_MODE_DV)
1001  {
1002  args->error = clib_error_return (0, "Direct Verbs mode not "
1003  "supported on this interface");
1004  goto err2;
1005  }
1006  }
1007  }
1008 
1009  if ((args->error = rdma_dev_init (vm, rd, args)))
1010  goto err2;
1011 
1012  if ((args->error = rdma_register_interface (vnm, rd)))
1013  goto err2;
1014 
1015  if ((args->error = rdma_async_event_init (rd)))
1016  goto err3;
1017 
1018  rdma_update_state (vnm, rd, 1);
1019 
1021  args->sw_if_index = rd->sw_if_index = sw->sw_if_index;
1022  /*
1023  * FIXME: add support for interrupt mode
1024  * vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, rd->hw_if_index);
1025  * hw->caps |= VNET_HW_INTERFACE_CAP_SUPPORTS_INT_MODE;
1026  */
1028 
1029  vec_foreach_index (qid, rd->rxqs)
1030  {
1031  u32 queue_index = vnet_hw_if_register_rx_queue (
1032  vnm, rd->hw_if_index, qid, VNET_HW_IF_RXQ_THREAD_ANY);
1033  rd->rxqs[qid].queue_index = queue_index;
1034  }
1036  vec_free (s);
1037  return;
1038 
1039 err3:
1040  rdma_unregister_interface (vnm, rd);
1041 err2:
1042  rdma_dev_cleanup (rd);
1043 err1:
1044  ibv_free_device_list (dev_list);
1045  vec_free (s);
1046  args->rv = VNET_API_ERROR_INVALID_INTERFACE;
1047 err0:
1048  vlib_log_err (rm->log_class, "%U", format_clib_error, args->error);
1049 }
1050 
1051 void
1053 {
1056  rdma_dev_cleanup (rd);
1057 }
1058 
1059 static clib_error_t *
1061 {
1062  vnet_hw_interface_t *hi = vnet_get_hw_interface (vnm, hw_if_index);
1063  rdma_main_t *rm = &rdma_main;
1065  uword is_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0;
1066 
1067  if (rd->flags & RDMA_DEVICE_F_ERROR)
1068  return clib_error_return (0, "device is in error state");
1069 
1070  if (is_up)
1071  {
1074  rd->flags |= RDMA_DEVICE_F_ADMIN_UP;
1075  }
1076  else
1077  {
1079  rd->flags &= ~RDMA_DEVICE_F_ADMIN_UP;
1080  }
1081  return 0;
1082 }
1083 
1084 static void
1086  u32 node_index)
1087 {
1088  rdma_main_t *rm = &rdma_main;
1089  vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index);
1092  ~0 ==
1094  vlib_node_add_next (vlib_get_main (), rdma_input_node.index, node_index);
1095 }
1096 
1097 static char *rdma_tx_func_error_strings[] = {
1098 #define _(n,s) s,
1100 #undef _
1101 };
1102 
1104 {
1105  .name = "RDMA interface",
1106  .format_device = format_rdma_device,
1107  .format_device_name = format_rdma_device_name,
1108  .admin_up_down_function = rdma_interface_admin_up_down,
1109  .rx_redirect_to_node = rdma_set_interface_next_node,
1110  .tx_function_n_errors = RDMA_TX_N_ERROR,
1111  .tx_function_error_strings = rdma_tx_func_error_strings,
1112  .mac_addr_change_function = rdma_mac_change,
1113 };
1114 
1115 clib_error_t *
1117 {
1118  rdma_main_t *rm = &rdma_main;
1120 
1121  rm->log_class = vlib_log_register_class ("rdma", 0);
1122 
1123  /* vlib_buffer_t template */
1126 
1127  for (int i = 0; i < tm->n_vlib_mains; i++)
1128  {
1130  clib_memset (&ptd->buffer_template, 0, sizeof (vlib_buffer_t));
1131  ptd->buffer_template.flags = VLIB_BUFFER_TOTAL_LENGTH_VALID;
1132  ptd->buffer_template.ref_count = 1;
1133  vnet_buffer (&ptd->buffer_template)->sw_if_index[VLIB_TX] = (u32) ~ 0;
1134  }
1135 
1136  return 0;
1137 }
1138 
1140 {
1141  .runs_after = VLIB_INITS ("pci_bus_init"),
1142 };
1143 
1144 /*
1145  * fd.io coding-style-patch-verification: ON
1146  *
1147  * Local Variables:
1148  * eval: (c-set-style "gnu")
1149  * End:
1150  */
vlib_log_class_t vlib_log_register_class(char *class, char *subclass)
Definition: log.c:339
u32 flags
buffer flags: VLIB_BUFFER_FREE_LIST_INDEX_MASK: bits used to store free list index, VLIB_BUFFER_IS_TRACED: trace this buffer.
Definition: buffer.h:133
struct ibv_mr * mr
Definition: rdma.h:220
volatile u32 * dv_sq_dbrec
Definition: rdma.h:136
struct mlx5_cqe64 * dv_cq_cqes
Definition: rdma.h:138
#define vec_foreach_index(var, v)
Iterate over vector indices.
static u32 rdma_dev_set_ucast(rdma_device_t *rd)
Definition: device.c:135
u8 * linux_ifname
Definition: rdma.h:211
__clib_export u8 * clib_sysfs_link_to_name(char *link)
Definition: sysfs.c:91
vl_api_mac_address_t mac
Definition: l2.api:559
rdma_mlx5_wqe_t * dv_sq_wqes
Definition: rdma.h:135
vl_api_wireguard_peer_flags_t flags
Definition: wireguard.api:105
u32 wq_stride
Definition: rdma.h:91
rdma_rss6_t rss6
Definition: rdma.h:287
void ethernet_delete_interface(vnet_main_t *vnm, u32 hw_if_index)
Definition: interface.c:393
struct ibv_flow * flow_mcast6
Definition: rdma.h:227
#define ntohs(x)
Definition: af_xdp.bpf.c:29
static u32 rdma_rxq_destroy_flow(const rdma_device_t *rd, struct ibv_flow **flow)
Definition: device.c:96
#define pool_get_zero(P, E)
Allocate an object E from a pool P and zero it.
Definition: pool.h:258
volatile u32 * cq_db
Definition: rdma.h:88
u32 cqn
Definition: rdma.h:89
u8 n_ds_per_wqe
Definition: rdma.h:111
#define rdma_log(lvl, dev, f,...)
Definition: device.c:58
format_function_t format_rdma_device
Definition: rdma.h:301
unsigned long u64
Definition: types.h:89
#define RDMA_RXQ_MAX_CHAIN_LOG_SZ
Definition: rdma.h:174
u32 size
Definition: rdma.h:76
static u32 rdma_dev_set_promisc(rdma_device_t *rd)
Definition: device.c:112
vlib_pci_device_info_t * pci
Definition: rdma.h:209
clib_memset(h->entries, 0, sizeof(h->entries[0]) *entries)
u32 dev_instance
Definition: rdma.h:214
static clib_error_t * rdma_rxq_init(vlib_main_t *vm, rdma_device_t *rd, u16 qid, u32 n_desc, u8 no_multi_seg, u16 max_pktlen)
Definition: device.c:425
vnet_hw_interface_capabilities_t caps
Definition: interface.h:645
static clib_error_t * rdma_rxq_finalize(vlib_main_t *vm, rdma_device_t *rd)
Definition: device.c:653
u64 private_data
Definition: file.h:64
u8 opcode_cqefmt_se_owner
Definition: rdma_mlx5dv.h:59
static vnet_hw_interface_t * vnet_get_hw_interface(vnet_main_t *vnm, u32 hw_if_index)
#define RDMA_TXQ_DV_INVALID_ID
Definition: rdma.h:166
u32 file_descriptor
Definition: file.h:54
struct ibv_wq * wq
Definition: rdma.h:74
static clib_error_t * rdma_dev_init(vlib_main_t *vm, rdma_device_t *rd, rdma_create_if_args_t *args)
Definition: device.c:813
volatile u32 * dv_cq_dbrec
Definition: rdma.h:139
u32 per_interface_next_index
Definition: rdma.h:202
static void vlib_pci_free_device_info(vlib_pci_device_info_t *di)
Definition: pci.h:114
vlib_buffer_main_t * buffer_main
Definition: main.h:165
rdma_main_t rdma_main
Definition: device.c:47
vl_api_fib_path_t path
Definition: mfib_types.api:44
#define vec_validate_aligned(V, I, A)
Make sure vector is long enough for given index (no header, specified alignment)
Definition: vec.h:535
#define ETHERNET_INTERFACE_FLAG_DEFAULT_L3
Definition: ethernet.h:160
clib_error_t * vnet_hw_interface_set_flags(vnet_main_t *vnm, u32 hw_if_index, vnet_hw_interface_flags_t flags)
Definition: interface.c:513
vhost_vring_addr_t addr
Definition: vhost_user.h:130
mac_address_t hwaddr
Definition: rdma.h:212
static uword vlib_node_add_next(vlib_main_t *vm, uword node, uword next_node)
Definition: node_funcs.h:1177
unsigned char u8
Definition: types.h:56
static uword min_log2(uword x)
Definition: clib.h:176
#define vec_reset_length(v)
Reset vector length to zero NULL-pointer tolerant.
clib_file_function_t * read_function
Definition: file.h:67
static vnet_sw_interface_t * vnet_get_hw_sw_interface(vnet_main_t *vnm, u32 hw_if_index)
unsigned int u32
Definition: types.h:88
vlib_log_class_t log_class
Definition: rdma.h:262
static void rdma_async_event_cleanup(rdma_device_t *rd)
Definition: device.c:350
vlib_frame_t * f
struct ibv_flow * flow_ucast6
Definition: rdma.h:226
#define rdma_log__(lvl, dev, f,...)
Definition: device.c:50
rdma_per_thread_data_t * per_thread_data
Definition: rdma.h:260
if(node->flags &VLIB_NODE_FLAG_TRACE) vnet_interface_output_trace(vm
VNET_DEVICE_CLASS(af_xdp_device_class)
struct ibv_flow * flow_mcast4
Definition: rdma.h:225
#define VLIB_INIT_FUNCTION(x)
Definition: init.h:172
static void rdma_set_interface_next_node(vnet_main_t *vnm, u32 hw_if_index, u32 node_index)
Definition: device.c:1085
struct ibv_pd * pd
Definition: rdma.h:219
static uword sysfs_path_to_pci_addr(char *path, vlib_pci_addr_t *addr)
Definition: device.c:863
#define vec_new(T, N)
Create new vector of given type and length (unspecified alignment, no header).
Definition: vec.h:365
rdma_device_t * devices
Definition: rdma.h:261
description fragment has unexpected format
Definition: map.api:433
uword buffer_mem_size
Definition: buffer.h:481
#define vec_elt_at_index(v, i)
Get vector value at index i checking that i is in bounds.
struct ibv_cq * cq
Definition: rdma.h:159
rdma_rss4_t rss4
Definition: rdma.h:286
#define clib_error_return(e, args...)
Definition: error.h:99
static void rdma_dev_cleanup(rdma_device_t *rd)
Definition: device.c:380
vnet_main_t * vnet_get_main(void)
clib_file_main_t file_main
Definition: main.c:63
rdma_rss6_t
Definition: rdma.h:186
#define clib_error_create(args...)
Definition: error.h:96
int __clib_unused rv
Definition: application.c:491
#define VLIB_FRAME_SIZE
Definition: node.h:369
rdma_rss6_t rss6
Definition: rdma.h:216
void unformat_init_string(unformat_input_t *input, char *string, int string_len)
Definition: unformat.c:1029
u32 flags
Definition: rdma.h:201
u32 queue_index
Definition: rdma.h:93
static clib_error_t * rdma_txq_init(vlib_main_t *vm, rdma_device_t *rd, u16 qid, u32 n_desc)
Definition: device.c:714
u32 * bufs
Definition: rdma.h:75
static uint64_t rdma_rss42ibv(const rdma_rss4_t rss4)
Definition: device.c:615
vlib_pci_device_info_t * vlib_pci_get_device_info(vlib_main_t *vm, vlib_pci_addr_t *addr, clib_error_t **error)
Definition: pci.c:202
u8 * description
Definition: file.h:70
#define pool_elt_at_index(p, i)
Returns pointer to element at given index.
Definition: pool.h:553
u32 vnet_hw_if_register_rx_queue(vnet_main_t *vnm, u32 hw_if_index, u32 queue_id, u32 thread_index)
Definition: rx_queue.c:64
static_always_inline void mac_address_from_bytes(mac_address_t *mac, const u8 *bytes)
Definition: mac_address.h:92
u32 * second_bufs
Definition: rdma.h:106
clib_error_t * rdma_init(vlib_main_t *vm)
Definition: device.c:1116
struct _unformat_input_t unformat_input_t
unsigned short u16
Definition: types.h:57
static clib_error_t * rdma_interface_admin_up_down(vnet_main_t *vnm, u32 hw_if_index, u32 flags)
Definition: device.c:1060
vlib_node_registration_t rdma_input_node
(constructor) VLIB_REGISTER_NODE (rdma_input_node)
Definition: input.c:1045
#define clib_error_return_unix(e, args...)
Definition: error.h:102
static u32 rdma_dev_change_mtu(rdma_device_t *rd)
Definition: device.c:187
struct ibv_cq * cq
Definition: rdma.h:73
u32 buf_sz
Definition: rdma.h:92
#define pool_put(P, E)
Free an object E in pool P.
Definition: pool.h:305
unformat_function_t unformat_vlib_pci_addr
Definition: pci.h:325
struct ibv_cq * ibv_cq
Definition: rdma.h:129
struct ibv_qp * qp
Definition: rdma.h:160
vlib_main_t * vm
X-connect all packets from the HOST to the PHY.
Definition: nat44_ei.c:3047
void vnet_hw_if_update_runtime_data(vnet_main_t *vnm, u32 hw_if_index)
Definition: runtime.c:58
static u8 rdma_rss_hash_key[]
Definition: device.c:34
#define vlib_log_err(...)
Definition: log.h:133
static_always_inline u32 vlib_buffer_get_default_data_size(vlib_main_t *vm)
Definition: buffer_funcs.h:122
vlib_buffer_t buffer_template
Definition: rdma.h:255
#define RDMA_RXQ_LEGACY_MODE_MAX_CHAIN_SZ
Definition: rdma.h:176
u32 hw_if_index
Definition: rdma.h:204
u32 wqe_cnt
Definition: rdma.h:90
u8 log_stride_per_wqe
Definition: rdma.h:99
struct ibv_rwq_ind_table * rx_rwq_ind_tbl
Definition: rdma.h:223
clib_error_t * error
Definition: rdma.h:229
rdma_rss4_t rss4
Definition: rdma.h:215
static uint64_t rdma_rss62ibv(const rdma_rss6_t rss6)
Definition: device.c:634
clib_error_t * error
Definition: rdma.h:292
rdma_mode_t mode
Definition: rdma.h:282
static void rdma_unregister_interface(vnet_main_t *vnm, rdma_device_t *rd)
Definition: device.c:373
struct ibv_flow * flow_ucast4
Definition: rdma.h:224
sll srl srl sll sra u16x4 i
Definition: vector_sse42.h:261
#define vec_free(V)
Free vector&#39;s memory (no header).
Definition: vec.h:395
vl_api_pnat_mask_t mask
Definition: pnat.api:45
u8 * driver_name
Definition: pci.h:82
u32 lkey
Definition: rdma.h:205
#define ETHERNET_INTERFACE_FLAG_MTU
Definition: ethernet.h:166
#define ETHERNET_INTERFACE_FLAG_ACCEPT_ALL
Definition: ethernet.h:163
#define RDMA_TXQ_BUF_SZ(txq)
Definition: rdma.h:168
u32 sw_if_index
Definition: rdma.h:203
u16 log2_cq_size
Definition: rdma.h:80
#define ASSERT(truth)
format_function_t format_rdma_device_name
Definition: rdma.h:302
u8 * n_used_per_chain
Definition: rdma.h:104
u8 bufs_log2sz
Definition: rdma.h:147
static uword clib_file_add(clib_file_main_t *um, clib_file_t *template)
Definition: file.h:96
static void clib_file_del_by_index(clib_file_main_t *um, uword index)
Definition: file.h:119
u8 log_wqe_sz
Definition: rdma.h:114
u8 * name
Definition: rdma.h:210
volatile u64 * dv_sq_db
Definition: rdma.h:137
rdma_txq_t * txqs
Definition: rdma.h:200
u8 dv_cq_log2sz
Definition: rdma.h:149
u8 dv_sq_log2sz
Definition: rdma.h:148
static u32 rdma_flag_change(vnet_main_t *vnm, vnet_hw_interface_t *hw, u32 flags)
Definition: device.c:194
struct ibv_qp * rx_qp4
Definition: rdma.h:221
vlib_pci_addr_t addr
Definition: pci.h:66
rdma_rxq_t * rxqs
Definition: rdma.h:199
#define clib_max(x, y)
Definition: clib.h:335
static vlib_main_t * vlib_get_main(void)
Definition: global_funcs.h:38
vl_api_ip4_address_t hi
Definition: arp.api:37
vnet_device_class_t rdma_device_class
static uword is_pow2(uword x)
Definition: clib.h:267
#define vec_elt(v, i)
Get vector value at index i.
u8 dv_wqe_tmpl[64]
Definition: rdma.h:153
vl_api_flow_t flow
Definition: flow_types.api:240
void vnet_hw_if_set_input_node(vnet_main_t *vnm, u32 hw_if_index, u32 node_index)
Definition: rx_queue.c:157
Definition: defs.h:47
mlx5dv_cqe_t * cqes
Definition: rdma.h:84
u32 async_event_clib_file_index
Definition: rdma.h:213
#define vec_len(v)
Number of elements in vector (rvalue-only, NULL tolerant)
u16 n_total_additional_segs
Definition: rdma.h:110
static void ethernet_mac_address_generate(u8 *mac)
Definition: mac_address.h:74
clib_error_t * ethernet_register_interface(vnet_main_t *vnm, u32 dev_class_index, u32 dev_instance, const u8 *address, u32 *hw_if_index_return, ethernet_flag_change_function_t flag_change)
Definition: interface.c:348
rdma_rss4_t
Definition: rdma.h:178
static uword max_log2(uword x)
Definition: clib.h:223
VLIB buffer representation.
Definition: buffer.h:111
u64 uword
Definition: types.h:112
static void unformat_free(unformat_input_t *i)
Definition: format.h:155
static struct ibv_flow * rdma_rxq_init_flow(const rdma_device_t *rd, struct ibv_qp *qp, const mac_address_t *mac, const mac_address_t *mask, u16 ether_type, u32 flags)
Definition: device.c:62
static void rdma_update_state(vnet_main_t *vnm, rdma_device_t *rd, int port)
Definition: device.c:214
uword buffer_mem_start
Definition: buffer.h:480
u32 * bufs
Definition: rdma.h:143
#define foreach_rdma_tx_func_error
Definition: rdma.h:314
node node_index
volatile u32 * wq_db
Definition: rdma.h:87
#define clib_error_free(e)
Definition: error.h:86
static char * rdma_tx_func_error_strings[]
Definition: device.c:1097
u16 port
Definition: lb_types.api:73
clib_file_function_t * error_function
Definition: file.h:67
static clib_error_t * rdma_mac_change(vnet_hw_interface_t *hw, const u8 *old, const u8 *new)
Definition: device.c:173
#define vnet_buffer(b)
Definition: buffer.h:437
static clib_error_t * rdma_register_interface(vnet_main_t *vnm, rdma_device_t *rd)
Definition: device.c:356
static clib_error_t * rdma_async_event_init(rdma_device_t *rd)
Definition: device.c:324
static vlib_thread_main_t * vlib_get_thread_main()
Definition: global_funcs.h:56
#define vlib_log_emerg(...)
Definition: log.h:130
#define vec_foreach(var, vec)
Vector iterator.
static clib_error_t * rdma_async_event_error_ready(clib_file_t *f)
Definition: device.c:281
#define MLX5_ETH_L2_INLINE_HEADER_SIZE
Definition: rdma.h:44
#define RDMA_MLX5_WQE_DS
Definition: rdma.h:65
Definition: file.h:51
static clib_error_t * rdma_async_event_read_ready(clib_file_t *f)
Definition: device.c:289
void rdma_delete_if(vlib_main_t *vm, rdma_device_t *rd)
Definition: device.c:1052
#define CLIB_CACHE_LINE_BYTES
Definition: cache.h:59
void rdma_create_if(vlib_main_t *vm, rdma_create_if_args_t *args)
Definition: device.c:881
#define STATIC_ASSERT_SIZEOF(d, s)
static u8 vlib_buffer_pool_get_default_for_numa(vlib_main_t *vm, u32 numa_node)
Definition: buffer_funcs.h:189
#define VNET_HW_IF_RXQ_THREAD_ANY
Definition: interface.h:598
volatile u8 ref_count
Reference count for this buffer.
Definition: buffer.h:139
__clib_export u8 * format_clib_error(u8 *s, va_list *va)
Definition: error.c:191
#define VLIB_INITS(...)
Definition: init.h:352
static void vnet_hw_interface_set_link_speed(vnet_main_t *vnm, u32 hw_if_index, u32 link_speed)
struct ibv_context * ctx
Definition: rdma.h:218
mlx5dv_wqe_ds_t * wqes
Definition: rdma.h:85
uword unformat(unformat_input_t *i, const char *fmt,...)
Definition: unformat.c:978
u32 ethernet_set_flags(vnet_main_t *vnm, u32 hw_if_index, u32 flags)
Definition: interface.c:441
struct ibv_qp * rx_qp6
Definition: rdma.h:222
struct ibv_qp * ibv_qp
Definition: rdma.h:130