FD.io VPP v17.01.1-3-gc6833f8 — Vector Packet Processing
vhost-user.c (Doxygen source listing; the generated documentation page for
this file is available separately.)
1 /*
2  *------------------------------------------------------------------
3  * vhost.c - vhost-user
4  *
5  * Copyright (c) 2014 Cisco and/or its affiliates.
6  * Licensed under the Apache License, Version 2.0 (the "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at:
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  *------------------------------------------------------------------
18  */
19 
20 #include <fcntl.h> /* for open */
21 #include <sys/ioctl.h>
22 #include <sys/socket.h>
23 #include <sys/un.h>
24 #include <sys/stat.h>
25 #include <sys/types.h>
26 #include <sys/uio.h> /* for iovec */
27 #include <netinet/in.h>
28 #include <sys/vfs.h>
29 
30 #include <linux/if_arp.h>
31 #include <linux/if_tun.h>
32 
33 #include <vlib/vlib.h>
34 #include <vlib/unix/unix.h>
35 
36 #include <vnet/ip/ip.h>
37 
38 #include <vnet/ethernet/ethernet.h>
39 #include <vnet/devices/devices.h>
40 #include <vnet/feature/feature.h>
41 
43 
44 /**
45  * @file
46  * @brief vHost User Device Driver.
47  *
48  * This file contains the source code for vHost User interface.
49  */
50 
51 
#define VHOST_USER_DEBUG_SOCKET 0
#define VHOST_DEBUG_VQ 0

/* DBG_SOCK: control-socket debug trace, compiled out unless
 * VHOST_USER_DEBUG_SOCKET is 1.  Both variants expand to a single
 * do { } while (0) statement so the macro is safe inside un-braced
 * if/else bodies; the previous definitions either carried a stray
 * trailing semicolon or expanded to nothing, which breaks
 * `if (x) DBG_SOCK (...); else ...`. */
#if VHOST_USER_DEBUG_SOCKET == 1
#define DBG_SOCK(args...) do { clib_warning (args); } while (0)
#else
#define DBG_SOCK(args...) do { } while (0)
#endif

/* DBG_VQ: per-vring datapath debug trace, compiled out unless
 * VHOST_DEBUG_VQ is 1. */
#if VHOST_DEBUG_VQ == 1
#define DBG_VQ(args...) do { clib_warning (args); } while (0)
#else
#define DBG_VQ(args...) do { } while (0)
#endif
66 
67 /*
68  * When an RX queue is down but active, received packets
69  * must be discarded. This value controls up to how many
70  * packets will be discarded during each round.
71  */
72 #define VHOST_USER_DOWN_DISCARD_COUNT 256
73 
74 /*
75  * When the number of available buffers gets under this threshold,
76  * RX node will start discarding packets.
77  */
78 #define VHOST_USER_RX_BUFFER_STARVATION 32
79 
80 /*
81  * On the receive side, the host should free descriptors as soon
82  * as possible in order to avoid TX drop in the VM.
83  * This value controls the number of copy operations that are stacked
84  * before copy is done for all and descriptors are given back to
85  * the guest.
86  * The value 64 was obtained by testing (48 and 128 were not as good).
87  */
88 #define VHOST_USER_RX_COPY_THRESHOLD 64
89 
/* Translate a unix_main file-pool index into its OS file descriptor,
 * or -1 when the index is unset (~0).
 * Statement-expression form (GCC extension, used throughout VPP)
 * evaluates the argument exactly once and — unlike the original
 * definition — carries no trailing semicolon, so the macro can be
 * used safely inside larger expressions. */
#define UNIX_GET_FD(unixfd_idx) \
  ({ typeof (unixfd_idx) unixfd_idx_ = (unixfd_idx); \
     (unixfd_idx_ != ~0) ? \
       pool_elt_at_index (unix_main.file_pool, \
                          unixfd_idx_)->file_descriptor : -1; })
94 
95 #define foreach_virtio_trace_flags \
96  _ (SIMPLE_CHAINED, 0, "Simple descriptor chaining") \
97  _ (SINGLE_DESC, 1, "Single descriptor packet") \
98  _ (INDIRECT, 2, "Indirect descriptor") \
99  _ (MAP_ERROR, 4, "Memory mapping error")
100 
101 typedef enum
102 {
103 #define _(n,i,s) VIRTIO_TRACE_F_##n,
105 #undef _
107 
109 
110 #define foreach_vhost_user_tx_func_error \
111  _(NONE, "no error") \
112  _(NOT_READY, "vhost vring not ready") \
113  _(DOWN, "vhost interface is down") \
114  _(PKT_DROP_NOBUF, "tx packet drops (no available descriptors)") \
115  _(PKT_DROP_NOMRG, "tx packet drops (cannot merge descriptors)") \
116  _(MMAP_FAIL, "mmap failure") \
117  _(INDIRECT_OVERFLOW, "indirect descriptor table overflow")
118 
119 typedef enum
120 {
121 #define _(f,s) VHOST_USER_TX_FUNC_ERROR_##f,
123 #undef _
126 
128 #define _(n,s) s,
130 #undef _
131 };
132 
133 #define foreach_vhost_user_input_func_error \
134  _(NO_ERROR, "no error") \
135  _(NO_BUFFER, "no available buffer") \
136  _(MMAP_FAIL, "mmap failure") \
137  _(INDIRECT_OVERFLOW, "indirect descriptor overflows table") \
138  _(UNDERSIZED_FRAME, "undersized ethernet frame received (< 14 bytes)") \
139  _(FULL_RX_QUEUE, "full rx queue (possible driver tx drop)")
140 
141 typedef enum
142 {
143 #define _(f,s) VHOST_USER_INPUT_FUNC_ERROR_##f,
145 #undef _
148 
150 #define _(n,s) s,
152 #undef _
153 };
154 
155 /* *INDENT-OFF* */
156 static vhost_user_main_t vhost_user_main = {
157  .mtu_bytes = 1518,
158 };
159 
160 VNET_HW_INTERFACE_CLASS (vhost_interface_class, static) = {
161  .name = "vhost-user",
162 };
163 /* *INDENT-ON* */
164 
165 static u8 *
167 {
168  u32 i = va_arg (*args, u32);
169  u32 show_dev_instance = ~0;
171 
173  show_dev_instance = vum->show_dev_instance_by_real_dev_instance[i];
174 
175  if (show_dev_instance != ~0)
176  i = show_dev_instance;
177 
178  s = format (s, "VirtualEthernet0/0/%d", i);
179  return s;
180 }
181 
182 static int
184 {
185  // FIXME: check if the new dev instance is already used
188  hi->dev_instance, ~0);
189 
191  new_dev_instance;
192 
193  DBG_SOCK ("renumbered vhost-user interface dev_instance %d to %d",
194  hi->dev_instance, new_dev_instance);
195 
196  return 0;
197 }
198 
201 {
202  int i = *hint;
203  if (PREDICT_TRUE ((vui->regions[i].guest_phys_addr <= addr) &&
204  ((vui->regions[i].guest_phys_addr +
205  vui->regions[i].memory_size) > addr)))
206  {
207  return (void *) (vui->region_mmap_addr[i] + addr -
208  vui->regions[i].guest_phys_addr);
209  }
210 #if __SSE4_2__
211  __m128i rl, rh, al, ah, r;
212  al = _mm_set1_epi64x (addr + 1);
213  ah = _mm_set1_epi64x (addr);
214 
215  rl = _mm_loadu_si128 ((__m128i *) & vui->region_guest_addr_lo[0]);
216  rl = _mm_cmpgt_epi64 (al, rl);
217  rh = _mm_loadu_si128 ((__m128i *) & vui->region_guest_addr_hi[0]);
218  rh = _mm_cmpgt_epi64 (rh, ah);
219  r = _mm_and_si128 (rl, rh);
220 
221  rl = _mm_loadu_si128 ((__m128i *) & vui->region_guest_addr_lo[2]);
222  rl = _mm_cmpgt_epi64 (al, rl);
223  rh = _mm_loadu_si128 ((__m128i *) & vui->region_guest_addr_hi[2]);
224  rh = _mm_cmpgt_epi64 (rh, ah);
225  r = _mm_blend_epi16 (r, _mm_and_si128 (rl, rh), 0x22);
226 
227  rl = _mm_loadu_si128 ((__m128i *) & vui->region_guest_addr_lo[4]);
228  rl = _mm_cmpgt_epi64 (al, rl);
229  rh = _mm_loadu_si128 ((__m128i *) & vui->region_guest_addr_hi[4]);
230  rh = _mm_cmpgt_epi64 (rh, ah);
231  r = _mm_blend_epi16 (r, _mm_and_si128 (rl, rh), 0x44);
232 
233  rl = _mm_loadu_si128 ((__m128i *) & vui->region_guest_addr_lo[6]);
234  rl = _mm_cmpgt_epi64 (al, rl);
235  rh = _mm_loadu_si128 ((__m128i *) & vui->region_guest_addr_hi[6]);
236  rh = _mm_cmpgt_epi64 (rh, ah);
237  r = _mm_blend_epi16 (r, _mm_and_si128 (rl, rh), 0x88);
238 
239  r = _mm_shuffle_epi8 (r, _mm_set_epi64x (0, 0x0e060c040a020800));
240  i = __builtin_ctzll (_mm_movemask_epi8 (r));
241 
242  if (i < vui->nregions)
243  {
244  *hint = i;
245  return (void *) (vui->region_mmap_addr[i] + addr -
246  vui->regions[i].guest_phys_addr);
247  }
248 
249 #else
250  for (i = 0; i < vui->nregions; i++)
251  {
252  if ((vui->regions[i].guest_phys_addr <= addr) &&
253  ((vui->regions[i].guest_phys_addr + vui->regions[i].memory_size) >
254  addr))
255  {
256  *hint = i;
257  return (void *) (vui->region_mmap_addr[i] + addr -
258  vui->regions[i].guest_phys_addr);
259  }
260  }
261 #endif
262  DBG_VQ ("failed to map guest mem addr %llx", addr);
263  *hint = 0;
264  return 0;
265 }
266 
267 static inline void *
269 {
270  int i;
271  for (i = 0; i < vui->nregions; i++)
272  {
273  if ((vui->regions[i].userspace_addr <= addr) &&
274  ((vui->regions[i].userspace_addr + vui->regions[i].memory_size) >
275  addr))
276  {
277  return (void *) (vui->region_mmap_addr[i] + addr -
278  vui->regions[i].userspace_addr);
279  }
280  }
281  return 0;
282 }
283 
/**
 * @brief Return the filesystem block size backing @c fd, used as the
 * (huge)page size for mmap length rounding.
 *
 * NOTE(review): the name line was dropped by the source extraction and
 * is reconstructed from the call sites
 * `get_huge_page_size (vui->region_mmap_fd[i])` / `get_huge_page_size (fd)`.
 *
 * The original ignored the fstatfs() return value and then read the
 * possibly-uninitialized struct — undefined behavior on failure.  Fall
 * back to a conventional 4 KiB page size instead (callers use the value
 * only to round mmap/munmap lengths, so a positive power of two is
 * required).
 */
static long
get_huge_page_size (int fd)
{
  struct statfs s;
  if (fstatfs (fd, &s) != 0)
    return 4096;
  return s.f_bsize;
}
291 
292 static void
294 {
295  int i, r;
296  for (i = 0; i < vui->nregions; i++)
297  {
298  if (vui->region_mmap_addr[i] != (void *) -1)
299  {
300 
301  long page_sz = get_huge_page_size (vui->region_mmap_fd[i]);
302 
303  ssize_t map_sz = (vui->regions[i].memory_size +
304  vui->regions[i].mmap_offset +
305  page_sz) & ~(page_sz - 1);
306 
307  r =
308  munmap (vui->region_mmap_addr[i] - vui->regions[i].mmap_offset,
309  map_sz);
310 
311  DBG_SOCK
312  ("unmap memory region %d addr 0x%lx len 0x%lx page_sz 0x%x", i,
313  vui->region_mmap_addr[i], map_sz, page_sz);
314 
315  vui->region_mmap_addr[i] = (void *) -1;
316 
317  if (r == -1)
318  {
319  clib_warning ("failed to unmap memory region (errno %d)",
320  errno);
321  }
322  close (vui->region_mmap_fd[i]);
323  }
324  }
325  vui->nregions = 0;
326 }
327 
328 static void
330 {
331  //Let's try to assign one queue to each thread
332  u32 qid = 0;
333  u32 cpu_index = 0;
334  vui->use_tx_spinlock = 0;
335  while (1)
336  {
337  for (qid = 0; qid < VHOST_VRING_MAX_N / 2; qid++)
338  {
339  vhost_user_vring_t *rxvq = &vui->vrings[VHOST_VRING_IDX_RX (qid)];
340  if (!rxvq->started || !rxvq->enabled)
341  continue;
342 
343  vui->per_cpu_tx_qid[cpu_index] = qid;
344  cpu_index++;
345  if (cpu_index == vlib_get_thread_main ()->n_vlib_mains)
346  return;
347  }
348  //We need to loop, meaning the spinlock has to be used
349  vui->use_tx_spinlock = 1;
350  if (cpu_index == 0)
351  {
352  //Could not find a single valid one
353  for (cpu_index = 0;
354  cpu_index < vlib_get_thread_main ()->n_vlib_mains; cpu_index++)
355  {
356  vui->per_cpu_tx_qid[cpu_index] = 0;
357  }
358  return;
359  }
360  }
361 }
362 
/* Re-distribute all interfaces' started TX vrings across the input
 * (worker) threads and set the input node polling/disabled state.
 * NOTE(review): this listing was produced by an extraction that dropped
 * several lines (the signature, the vhost_user_main/iaq declarations,
 * the first vlib_node_set_state argument and the per-cpu queue-vector
 * reset) — the code below is kept byte-for-byte as extracted. */
363 static void
365 {
367  vhost_user_intf_t *vui;
368  vhost_cpu_t *vhc;
369  u32 *workers = 0;
370 
371  //Let's list all workers cpu indexes
372  u32 i;
373  for (i = vum->input_cpu_first_index;
374  i < vum->input_cpu_first_index + vum->input_cpu_count; i++)
375  {
377  vhost_user_input_node.index,
378  VLIB_NODE_STATE_DISABLED);
379  vec_add1 (workers, i);
380  }
381 
382  vec_foreach (vhc, vum->cpus)
383  {
385  }
386 
387  i = 0;
389  /* *INDENT-OFF* */
390  pool_foreach (vui, vum->vhost_user_interfaces, {
391  u32 *vui_workers = vec_len (vui->workers) ? vui->workers : workers;
392  u32 qid;
393  for (qid = 0; qid < VHOST_VRING_MAX_N / 2; qid++)
394  {
395  vhost_user_vring_t *txvq =
396  &vui->vrings[VHOST_VRING_IDX_TX (qid)];
397  if (!txvq->started)
398  continue;
399 
400  i %= vec_len (vui_workers);
401  u32 cpu_index = vui_workers[i];
402  i++;
403  vhc = &vum->cpus[cpu_index];
404 
405  iaq.qid = qid;
406  iaq.vhost_iface_index = vui - vum->vhost_user_interfaces;
407  vec_add1 (vhc->rx_queues, iaq);
408  vlib_node_set_state (vlib_mains ? vlib_mains[cpu_index] :
409  &vlib_global_main, vhost_user_input_node.index,
410  VLIB_NODE_STATE_POLLING);
411  }
412  });
413  /* *INDENT-ON* */
414 }
415 
/* Add (del == 0) or remove (del != 0) a worker thread from an
 * interface's explicit RX worker list.
 * Returns 0 on success, -1 for an out-of-range worker index, -2 for an
 * unknown sw_if_index, -3 when deleting a worker that is not assigned.
 * NOTE(review): extraction dropped several declaration lines (vum, hw,
 * the vui lookup and the upper-bound expression of the range check);
 * the remaining code is kept byte-for-byte as extracted. */
416 static int
417 vhost_user_thread_placement (u32 sw_if_index, u32 worker_thread_index, u8 del)
418 {
420  vhost_user_intf_t *vui;
422 
423  if (worker_thread_index < vum->input_cpu_first_index ||
424  worker_thread_index >=
426  return -1;
427 
428  if (!(hw = vnet_get_sup_hw_interface (vnet_get_main (), sw_if_index)))
429  return -2;
430 
432  u32 found = ~0, *w;
/* Linear scan for the worker in the interface's list. */
433  vec_foreach (w, vui->workers)
434  {
435  if (*w == worker_thread_index)
436  {
437  found = w - vui->workers;
438  break;
439  }
440  }
441 
442  if (del)
443  {
444  if (found == ~0)
445  return -3;
446  vec_del1 (vui->workers, found);
447  }
448  else if (found == ~0)
449  {
450  vec_add1 (vui->workers, worker_thread_index);
451  }
452 
454  return 0;
455 }
456 
457 /** @brief Returns whether at least one TX and one RX vring are enabled */
458 int
460 {
461  int i, found[2] = { }; //RX + TX
462 
463  for (i = 0; i < VHOST_VRING_MAX_N; i++)
464  if (vui->vrings[i].started && vui->vrings[i].enabled)
465  found[i & 1] = 1;
466 
467  return found[0] && found[1];
468 }
469 
/* Recompute interface readiness and, when it changed, record the new
 * state (the extracted listing dropped the signature line and the
 * vnet_hw_interface_set_flags call that propagates link state — code
 * kept byte-for-byte as extracted). */
470 static void
472 {
473  /* if we have pointers to descriptor table, go up */
474  int is_up = vhost_user_intf_ready (vui);
475  if (is_up != vui->is_up)
476  {
477  DBG_SOCK ("interface %d %s", vui->sw_if_index,
478  is_up ? "ready" : "down");
481  0);
482  vui->is_up = is_up;
483  }
486 }
487 
488 static clib_error_t *
490 {
491  __attribute__ ((unused)) int n;
492  u8 buff[8];
493  n = read (uf->file_descriptor, ((char *) &buff), 8);
494  return 0;
495 }
496 
/* unix_file read callback for a vring kick eventfd: drains the eventfd
 * and marks the kicked vring as started.  private_data packs the
 * interface pool index (high bits) and the queue id (low byte).
 * NOTE(review): the extraction dropped the signature line and the
 * lines between `started = 1` and `return` (state refresh) — code kept
 * byte-for-byte as extracted. */
497 static clib_error_t *
499 {
500  __attribute__ ((unused)) int n;
501  u8 buff[8];
502  vhost_user_intf_t *vui =
503  pool_elt_at_index (vhost_user_main.vhost_user_interfaces,
504  uf->private_data >> 8);
505  u32 qid = uf->private_data & 0xff;
506  n = read (uf->file_descriptor, ((char *) &buff), 8);
507  DBG_SOCK ("if %d KICK queue %d", uf->private_data >> 8, qid);
508 
510  vui->vrings[qid].started = 1;
513  return 0;
514 }
515 
516 /**
517  * @brief Try once to lock the vring
518  * @return 0 on success, non-zero on failure.
519  */
520 static inline int
522 {
523  return __sync_lock_test_and_set (vui->vring_locks[qid], 1);
524 }
525 
526 /**
527  * @brief Spin until the vring is successfully locked
528  */
529 static inline void
531 {
532  while (vhost_user_vring_try_lock (vui, qid))
533  ;
534 }
535 
536 /**
537  * @brief Unlock the vring lock
538  */
539 static inline void
541 {
542  *vui->vring_locks[qid] = 0;
543 }
544 
545 static inline void
547 {
548  vhost_user_vring_t *vring = &vui->vrings[qid];
549  memset (vring, 0, sizeof (*vring));
550  vring->kickfd_idx = ~0;
551  vring->callfd_idx = ~0;
552  vring->errfd = -1;
553 
554  /*
555  * We have a bug with some qemu 2.5, and this may be a fix.
556  * Feel like interpretation holy text, but this is from vhost-user.txt.
557  * "
558  * One queue pair is enabled initially. More queues are enabled
559  * dynamically, by sending message VHOST_USER_SET_VRING_ENABLE.
560  * "
561  * Don't know who's right, but this is what DPDK does.
562  */
563  if (qid == 0 || qid == 1)
564  vring->enabled = 1;
565 }
566 
/* Tear down a vring: unregister and close its kick/call unix files and
 * its error fd, then re-initialize the vring to the disconnected state.
 * NOTE(review): the extraction dropped the signature line and the two
 * `unix_file_t *uf = pool_elt_at_index (...)` lines feeding
 * unix_file_del — code kept byte-for-byte as extracted. */
567 static inline void
569 {
570  vhost_user_vring_t *vring = &vui->vrings[qid];
571  if (vring->kickfd_idx != ~0)
572  {
574  vring->kickfd_idx);
575  unix_file_del (&unix_main, uf);
576  vring->kickfd_idx = ~0;
577  }
578  if (vring->callfd_idx != ~0)
579  {
581  vring->callfd_idx);
582  unix_file_del (&unix_main, uf);
583  vring->callfd_idx = ~0;
584  }
585  if (vring->errfd != -1)
586  close (vring->errfd);
587  vhost_user_vring_init (vui, qid);
588 }
589 
/* Full disconnect of a vhost-user interface: drop the control-socket
 * unix file, mark the interface down, close every vring and unmap all
 * guest memory regions.
 * NOTE(review): the extraction dropped the signature line, the
 * hw-interface flags update and the unix_file_del for the control
 * socket — code kept byte-for-byte as extracted. */
589 static inline void
592 {
593  vnet_main_t *vnm = vnet_get_main ();
594  int q;
595 
597 
598  if (vui->unix_file_index != ~0)
599  {
601  vui->unix_file_index = ~0;
602  }
603 
604  vui->is_up = 0;
605 
606  for (q = 0; q < VHOST_VRING_MAX_N; q++)
607  vhost_user_vring_close (vui, q);
608 
609  unmap_all_mem_regions (vui);
610  DBG_SOCK ("interface ifindex %d disconnected", vui->sw_if_index);
611 }
612 
613 #define VHOST_LOG_PAGE 0x1000
616  u64 addr, u64 len, u8 is_host_address)
617 {
618  if (PREDICT_TRUE (vui->log_base_addr == 0
619  || !(vui->features & (1 << FEAT_VHOST_F_LOG_ALL))))
620  {
621  return;
622  }
623  if (is_host_address)
624  {
625  addr = (u64) map_user_mem (vui, (uword) addr);
626  }
627  if (PREDICT_FALSE ((addr + len - 1) / VHOST_LOG_PAGE / 8 >= vui->log_size))
628  {
629  DBG_SOCK ("vhost_user_log_dirty_pages(): out of range\n");
630  return;
631  }
632 
634  u64 page = addr / VHOST_LOG_PAGE;
635  while (page * VHOST_LOG_PAGE < addr + len)
636  {
637  ((u8 *) vui->log_base_addr)[page / 8] |= 1 << page % 8;
638  page++;
639  }
640 }
641 
644 {
645  vhost_user_log_dirty_pages_2 (vui, addr, len, 0);
646 }
647 
/* Log the dirty page(s) covering one member of the used ring, but only
 * when logging was requested for this vring (VHOST_VRING_F_LOG).
 * Wrapped in do { } while (0): the original expanded to a bare
 * if { } block, which mis-binds in `if (x) vhost_user_log_dirty_ring(...);
 * else ...` (dangling-else) and breaks with a following semicolon in
 * some contexts.  Arguments are parenthesized against operator-
 * precedence surprises. */
#define vhost_user_log_dirty_ring(vui, vq, member) \
  do { \
    if (PREDICT_FALSE ((vq)->log_used)) \
      vhost_user_log_dirty_pages ((vui), (vq)->log_guest_addr + \
				  STRUCT_OFFSET_OF (vring_used_t, member), \
				  sizeof ((vq)->used->member)); \
  } while (0)
653 
654 static clib_error_t *
656 {
657  int n, i;
658  int fd, number_of_fds = 0;
659  int fds[VHOST_MEMORY_MAX_NREGIONS];
660  vhost_user_msg_t msg;
661  struct msghdr mh;
662  struct iovec iov[1];
664  vhost_user_intf_t *vui;
665  struct cmsghdr *cmsg;
666  u8 q;
667  unix_file_t template = { 0 };
668  vnet_main_t *vnm = vnet_get_main ();
669 
670  vui = pool_elt_at_index (vum->vhost_user_interfaces, uf->private_data);
671 
672  char control[CMSG_SPACE (VHOST_MEMORY_MAX_NREGIONS * sizeof (int))];
673 
674  memset (&mh, 0, sizeof (mh));
675  memset (control, 0, sizeof (control));
676 
677  for (i = 0; i < VHOST_MEMORY_MAX_NREGIONS; i++)
678  fds[i] = -1;
679 
680  /* set the payload */
681  iov[0].iov_base = (void *) &msg;
682  iov[0].iov_len = VHOST_USER_MSG_HDR_SZ;
683 
684  mh.msg_iov = iov;
685  mh.msg_iovlen = 1;
686  mh.msg_control = control;
687  mh.msg_controllen = sizeof (control);
688 
689  n = recvmsg (uf->file_descriptor, &mh, 0);
690 
691  /* Stop workers to avoid end of the world */
693 
694  if (n != VHOST_USER_MSG_HDR_SZ)
695  {
696  if (n == -1)
697  {
698  DBG_SOCK ("recvmsg returned error %d %s", errno, strerror (errno));
699  }
700  else
701  {
702  DBG_SOCK ("n (%d) != VHOST_USER_MSG_HDR_SZ (%d)",
704  }
705  goto close_socket;
706  }
707 
708  if (mh.msg_flags & MSG_CTRUNC)
709  {
710  DBG_SOCK ("MSG_CTRUNC is set");
711  goto close_socket;
712  }
713 
714  cmsg = CMSG_FIRSTHDR (&mh);
715 
716  if (cmsg && (cmsg->cmsg_len > 0) && (cmsg->cmsg_level == SOL_SOCKET) &&
717  (cmsg->cmsg_type == SCM_RIGHTS) &&
718  (cmsg->cmsg_len - CMSG_LEN (0) <=
719  VHOST_MEMORY_MAX_NREGIONS * sizeof (int)))
720  {
721  number_of_fds = (cmsg->cmsg_len - CMSG_LEN (0)) / sizeof (int);
722  clib_memcpy (fds, CMSG_DATA (cmsg), number_of_fds * sizeof (int));
723  }
724 
725  /* version 1, no reply bit set */
726  if ((msg.flags & 7) != 1)
727  {
728  DBG_SOCK ("malformed message received. closing socket");
729  goto close_socket;
730  }
731 
732  {
733  int rv;
734  rv =
735  read (uf->file_descriptor, ((char *) &msg) + VHOST_USER_MSG_HDR_SZ,
736  msg.size);
737  if (rv < 0)
738  {
739  DBG_SOCK ("read failed %s", strerror (errno));
740  goto close_socket;
741  }
742  else if (rv != msg.size)
743  {
744  DBG_SOCK ("message too short (read %dB should be %dB)", rv, msg.size);
745  goto close_socket;
746  }
747  }
748 
749  switch (msg.request)
750  {
752  msg.flags |= 4;
753  msg.u64 = (1ULL << FEAT_VIRTIO_NET_F_MRG_RXBUF) |
754  (1ULL << FEAT_VIRTIO_NET_F_CTRL_VQ) |
755  (1ULL << FEAT_VIRTIO_F_ANY_LAYOUT) |
756  (1ULL << FEAT_VIRTIO_F_INDIRECT_DESC) |
757  (1ULL << FEAT_VHOST_F_LOG_ALL) |
758  (1ULL << FEAT_VIRTIO_NET_F_GUEST_ANNOUNCE) |
759  (1ULL << FEAT_VIRTIO_NET_F_MQ) |
760  (1ULL << FEAT_VHOST_USER_F_PROTOCOL_FEATURES) |
761  (1ULL << FEAT_VIRTIO_F_VERSION_1);
762  msg.u64 &= vui->feature_mask;
763  msg.size = sizeof (msg.u64);
764  DBG_SOCK ("if %d msg VHOST_USER_GET_FEATURES - reply 0x%016llx",
765  vui->hw_if_index, msg.u64);
766  break;
767 
769  DBG_SOCK ("if %d msg VHOST_USER_SET_FEATURES features 0x%016llx",
770  vui->hw_if_index, msg.u64);
771 
772  vui->features = msg.u64;
773 
774  if (vui->features &
775  ((1 << FEAT_VIRTIO_NET_F_MRG_RXBUF) |
776  (1ULL << FEAT_VIRTIO_F_VERSION_1)))
777  vui->virtio_net_hdr_sz = 12;
778  else
779  vui->virtio_net_hdr_sz = 10;
780 
781  vui->is_any_layout =
782  (vui->features & (1 << FEAT_VIRTIO_F_ANY_LAYOUT)) ? 1 : 0;
783 
786  vui->is_up = 0;
787 
788  /*for (q = 0; q < VHOST_VRING_MAX_N; q++)
789  vhost_user_vring_close(&vui->vrings[q]); */
790 
791  break;
792 
794  DBG_SOCK ("if %d msg VHOST_USER_SET_MEM_TABLE nregions %d",
795  vui->hw_if_index, msg.memory.nregions);
796 
797  if ((msg.memory.nregions < 1) ||
798  (msg.memory.nregions > VHOST_MEMORY_MAX_NREGIONS))
799  {
800 
801  DBG_SOCK ("number of mem regions must be between 1 and %i",
802  VHOST_MEMORY_MAX_NREGIONS);
803 
804  goto close_socket;
805  }
806 
807  if (msg.memory.nregions != number_of_fds)
808  {
809  DBG_SOCK ("each memory region must have FD");
810  goto close_socket;
811  }
812  unmap_all_mem_regions (vui);
813  for (i = 0; i < msg.memory.nregions; i++)
814  {
815  clib_memcpy (&(vui->regions[i]), &msg.memory.regions[i],
816  sizeof (vhost_user_memory_region_t));
817 
818  long page_sz = get_huge_page_size (fds[i]);
819 
820  /* align size to 2M page */
821  ssize_t map_sz = (vui->regions[i].memory_size +
822  vui->regions[i].mmap_offset +
823  page_sz) & ~(page_sz - 1);
824 
825  vui->region_mmap_addr[i] = mmap (0, map_sz, PROT_READ | PROT_WRITE,
826  MAP_SHARED, fds[i], 0);
827  vui->region_guest_addr_lo[i] = vui->regions[i].guest_phys_addr;
828  vui->region_guest_addr_hi[i] = vui->regions[i].guest_phys_addr +
829  vui->regions[i].memory_size;
830 
831  DBG_SOCK
832  ("map memory region %d addr 0 len 0x%lx fd %d mapped 0x%lx "
833  "page_sz 0x%x", i, map_sz, fds[i], vui->region_mmap_addr[i],
834  page_sz);
835 
836  if (vui->region_mmap_addr[i] == MAP_FAILED)
837  {
838  clib_warning ("failed to map memory. errno is %d", errno);
839  goto close_socket;
840  }
841  vui->region_mmap_addr[i] += vui->regions[i].mmap_offset;
842  vui->region_mmap_fd[i] = fds[i];
843  }
844  vui->nregions = msg.memory.nregions;
845  break;
846 
848  DBG_SOCK ("if %d msg VHOST_USER_SET_VRING_NUM idx %d num %d",
849  vui->hw_if_index, msg.state.index, msg.state.num);
850 
851  if ((msg.state.num > 32768) || /* maximum ring size is 32768 */
852  (msg.state.num == 0) || /* it cannot be zero */
853  ((msg.state.num - 1) & msg.state.num)) /* must be power of 2 */
854  goto close_socket;
855  vui->vrings[msg.state.index].qsz = msg.state.num;
856  break;
857 
859  DBG_SOCK ("if %d msg VHOST_USER_SET_VRING_ADDR idx %d",
860  vui->hw_if_index, msg.state.index);
861 
862  if (msg.state.index >= VHOST_VRING_MAX_N)
863  {
864  DBG_SOCK ("invalid vring index VHOST_USER_SET_VRING_ADDR:"
865  " %d >= %d", msg.state.index, VHOST_VRING_MAX_N);
866  goto close_socket;
867  }
868 
869  if (msg.size < sizeof (msg.addr))
870  {
871  DBG_SOCK ("vhost message is too short (%d < %d)",
872  msg.size, sizeof (msg.addr));
873  goto close_socket;
874  }
875 
876  vui->vrings[msg.state.index].desc = (vring_desc_t *)
877  map_user_mem (vui, msg.addr.desc_user_addr);
878  vui->vrings[msg.state.index].used = (vring_used_t *)
879  map_user_mem (vui, msg.addr.used_user_addr);
880  vui->vrings[msg.state.index].avail = (vring_avail_t *)
881  map_user_mem (vui, msg.addr.avail_user_addr);
882 
883  if ((vui->vrings[msg.state.index].desc == NULL) ||
884  (vui->vrings[msg.state.index].used == NULL) ||
885  (vui->vrings[msg.state.index].avail == NULL))
886  {
887  DBG_SOCK ("failed to map user memory for hw_if_index %d",
888  vui->hw_if_index);
889  goto close_socket;
890  }
891 
892  vui->vrings[msg.state.index].log_guest_addr = msg.addr.log_guest_addr;
893  vui->vrings[msg.state.index].log_used =
894  (msg.addr.flags & (1 << VHOST_VRING_F_LOG)) ? 1 : 0;
895 
896  /* Spec says: If VHOST_USER_F_PROTOCOL_FEATURES has not been negotiated,
897  the ring is initialized in an enabled state. */
898  if (!(vui->features & (1 << FEAT_VHOST_USER_F_PROTOCOL_FEATURES)))
899  {
900  vui->vrings[msg.state.index].enabled = 1;
901  }
902 
903  vui->vrings[msg.state.index].last_used_idx =
904  vui->vrings[msg.state.index].last_avail_idx =
905  vui->vrings[msg.state.index].used->idx;
906 
907  /* tell driver that we don't want interrupts */
908  vui->vrings[msg.state.index].used->flags = VRING_USED_F_NO_NOTIFY;
909  break;
910 
912  DBG_SOCK ("if %d msg VHOST_USER_SET_OWNER", vui->hw_if_index);
913  break;
914 
916  DBG_SOCK ("if %d msg VHOST_USER_RESET_OWNER", vui->hw_if_index);
917  break;
918 
920  DBG_SOCK ("if %d msg VHOST_USER_SET_VRING_CALL u64 %d",
921  vui->hw_if_index, msg.u64);
922 
923  q = (u8) (msg.u64 & 0xFF);
924 
925  /* if there is old fd, delete and close it */
926  if (vui->vrings[q].callfd_idx != ~0)
927  {
929  vui->vrings[q].callfd_idx);
930  unix_file_del (&unix_main, uf);
931  vui->vrings[q].callfd_idx = ~0;
932  }
933 
934  if (!(msg.u64 & 0x100))
935  {
936  if (number_of_fds != 1)
937  {
938  DBG_SOCK ("More than one fd received !");
939  goto close_socket;
940  }
941 
942  template.read_function = vhost_user_callfd_read_ready;
943  template.file_descriptor = fds[0];
944  template.private_data =
945  ((vui - vhost_user_main.vhost_user_interfaces) << 8) + q;
946  vui->vrings[q].callfd_idx = unix_file_add (&unix_main, &template);
947  }
948  else
949  vui->vrings[q].callfd_idx = ~0;
950  break;
951 
953  DBG_SOCK ("if %d msg VHOST_USER_SET_VRING_KICK u64 %d",
954  vui->hw_if_index, msg.u64);
955 
956  q = (u8) (msg.u64 & 0xFF);
957 
958  if (vui->vrings[q].kickfd_idx != ~0)
959  {
961  vui->vrings[q].kickfd_idx);
962  unix_file_del (&unix_main, uf);
963  vui->vrings[q].kickfd_idx = ~0;
964  }
965 
966  if (!(msg.u64 & 0x100))
967  {
968  if (number_of_fds != 1)
969  {
970  DBG_SOCK ("More than one fd received !");
971  goto close_socket;
972  }
973 
974  template.read_function = vhost_user_kickfd_read_ready;
975  template.file_descriptor = fds[0];
976  template.private_data =
977  (((uword) (vui - vhost_user_main.vhost_user_interfaces)) << 8) +
978  q;
979  vui->vrings[q].kickfd_idx = unix_file_add (&unix_main, &template);
980  }
981  else
982  {
983  //When no kickfd is set, the queue is initialized as started
984  vui->vrings[q].kickfd_idx = ~0;
985  vui->vrings[q].started = 1;
986  }
987 
988  break;
989 
991  DBG_SOCK ("if %d msg VHOST_USER_SET_VRING_ERR u64 %d",
992  vui->hw_if_index, msg.u64);
993 
994  q = (u8) (msg.u64 & 0xFF);
995 
996  if (vui->vrings[q].errfd != -1)
997  close (vui->vrings[q].errfd);
998 
999  if (!(msg.u64 & 0x100))
1000  {
1001  if (number_of_fds != 1)
1002  goto close_socket;
1003 
1004  vui->vrings[q].errfd = fds[0];
1005  }
1006  else
1007  vui->vrings[q].errfd = -1;
1008 
1009  break;
1010 
1012  DBG_SOCK ("if %d msg VHOST_USER_SET_VRING_BASE idx %d num %d",
1013  vui->hw_if_index, msg.state.index, msg.state.num);
1014 
1015  vui->vrings[msg.state.index].last_avail_idx = msg.state.num;
1016  break;
1017 
1019  DBG_SOCK ("if %d msg VHOST_USER_GET_VRING_BASE idx %d num %d",
1020  vui->hw_if_index, msg.state.index, msg.state.num);
1021 
1022  if (msg.state.index >= VHOST_VRING_MAX_N)
1023  {
1024  DBG_SOCK ("invalid vring index VHOST_USER_GET_VRING_BASE:"
1025  " %d >= %d", msg.state.index, VHOST_VRING_MAX_N);
1026  goto close_socket;
1027  }
1028 
1029  /* Spec says: Client must [...] stop ring upon receiving VHOST_USER_GET_VRING_BASE. */
1030  vhost_user_vring_close (vui, msg.state.index);
1031 
1032  msg.state.num = vui->vrings[msg.state.index].last_avail_idx;
1033  msg.flags |= 4;
1034  msg.size = sizeof (msg.state);
1035  break;
1036 
1037  case VHOST_USER_NONE:
1038  DBG_SOCK ("if %d msg VHOST_USER_NONE", vui->hw_if_index);
1039 
1040  break;
1041 
1043  {
1044  DBG_SOCK ("if %d msg VHOST_USER_SET_LOG_BASE", vui->hw_if_index);
1045 
1046  if (msg.size != sizeof (msg.log))
1047  {
1048  DBG_SOCK
1049  ("invalid msg size for VHOST_USER_SET_LOG_BASE: %d instead of %d",
1050  msg.size, sizeof (msg.log));
1051  goto close_socket;
1052  }
1053 
1054  if (!
1056  {
1057  DBG_SOCK
1058  ("VHOST_USER_PROTOCOL_F_LOG_SHMFD not set but VHOST_USER_SET_LOG_BASE received");
1059  goto close_socket;
1060  }
1061 
1062  fd = fds[0];
1063  /* align size to 2M page */
1064  long page_sz = get_huge_page_size (fd);
1065  ssize_t map_sz =
1066  (msg.log.size + msg.log.offset + page_sz) & ~(page_sz - 1);
1067 
1068  vui->log_base_addr = mmap (0, map_sz, PROT_READ | PROT_WRITE,
1069  MAP_SHARED, fd, 0);
1070 
1071  DBG_SOCK
1072  ("map log region addr 0 len 0x%lx off 0x%lx fd %d mapped 0x%lx",
1073  map_sz, msg.log.offset, fd, vui->log_base_addr);
1074 
1075  if (vui->log_base_addr == MAP_FAILED)
1076  {
1077  clib_warning ("failed to map memory. errno is %d", errno);
1078  goto close_socket;
1079  }
1080 
1081  vui->log_base_addr += msg.log.offset;
1082  vui->log_size = msg.log.size;
1083 
1084  msg.flags |= 4;
1085  msg.size = sizeof (msg.u64);
1086 
1087  break;
1088  }
1089 
1090  case VHOST_USER_SET_LOG_FD:
1091  DBG_SOCK ("if %d msg VHOST_USER_SET_LOG_FD", vui->hw_if_index);
1092 
1093  break;
1094 
1096  DBG_SOCK ("if %d msg VHOST_USER_GET_PROTOCOL_FEATURES",
1097  vui->hw_if_index);
1098 
1099  msg.flags |= 4;
1100  msg.u64 = (1 << VHOST_USER_PROTOCOL_F_LOG_SHMFD) |
1101  (1 << VHOST_USER_PROTOCOL_F_MQ);
1102  msg.size = sizeof (msg.u64);
1103  break;
1104 
1106  DBG_SOCK ("if %d msg VHOST_USER_SET_PROTOCOL_FEATURES features 0x%lx",
1107  vui->hw_if_index, msg.u64);
1108 
1109  vui->protocol_features = msg.u64;
1110 
1111  break;
1112 
1114  DBG_SOCK ("if %d msg VHOST_USER_GET_QUEUE_NUM", vui->hw_if_index);
1115  msg.flags |= 4;
1116  msg.u64 = VHOST_VRING_MAX_N;
1117  msg.size = sizeof (msg.u64);
1118  break;
1119 
1121  DBG_SOCK ("if %d VHOST_USER_SET_VRING_ENABLE: %s queue %d",
1122  vui->hw_if_index, msg.state.num ? "enable" : "disable",
1123  msg.state.index);
1124  if (msg.state.index >= VHOST_VRING_MAX_N)
1125  {
1126  DBG_SOCK ("invalid vring index VHOST_USER_SET_VRING_ENABLE:"
1127  " %d >= %d", msg.state.index, VHOST_VRING_MAX_N);
1128  goto close_socket;
1129  }
1130 
1131  vui->vrings[msg.state.index].enabled = msg.state.num;
1132  break;
1133 
1134  default:
1135  DBG_SOCK ("unknown vhost-user message %d received. closing socket",
1136  msg.request);
1137  goto close_socket;
1138  }
1139 
1140  /* if we need to reply */
1141  if (msg.flags & 4)
1142  {
1143  n =
1144  send (uf->file_descriptor, &msg, VHOST_USER_MSG_HDR_SZ + msg.size, 0);
1145  if (n != (msg.size + VHOST_USER_MSG_HDR_SZ))
1146  {
1147  DBG_SOCK ("could not send message response");
1148  goto close_socket;
1149  }
1150  }
1151 
1154  return 0;
1155 
1156 close_socket:
1160  return 0;
1161 }
1162 
/* unix_file error callback for a vhost-user client socket: logs and
 * (in the original file) disconnects the interface.
 * NOTE(review): the extraction dropped the signature line, the vui
 * pool lookup tail and the disconnect/cleanup calls between the
 * DBG_SOCK and the return — code kept byte-for-byte as extracted. */
1163 static clib_error_t *
1165 {
1166  vlib_main_t *vm = vlib_get_main ();
1168  vhost_user_intf_t *vui =
1170 
1171  DBG_SOCK ("socket error on if %d", vui->sw_if_index);
1176  return 0;
1177 }
1178 
1179 static clib_error_t *
1181 {
1182  int client_fd, client_len;
1183  struct sockaddr_un client;
1184  unix_file_t template = { 0 };
1186  vhost_user_intf_t *vui;
1187 
1189 
1190  client_len = sizeof (client);
1191  client_fd = accept (uf->file_descriptor,
1192  (struct sockaddr *) &client,
1193  (socklen_t *) & client_len);
1194 
1195  if (client_fd < 0)
1196  return clib_error_return_unix (0, "accept");
1197 
1198  DBG_SOCK ("New client socket for vhost interface %d", vui->sw_if_index);
1199  template.read_function = vhost_user_socket_read;
1200  template.error_function = vhost_user_socket_error;
1201  template.file_descriptor = client_fd;
1202  template.private_data = vui - vhost_user_main.vhost_user_interfaces;
1203  vui->unix_file_index = unix_file_add (&unix_main, &template);
1204  return 0;
1205 }
1206 
1207 static clib_error_t *
1209 {
1210  clib_error_t *error;
1214  uword *p;
1215 
1216  error = vlib_call_init_function (vm, ip4_init);
1217  if (error)
1218  return error;
1219 
1220  vum->coalesce_frames = 32;
1221  vum->coalesce_time = 1e-3;
1222 
1223  vec_validate (vum->cpus, tm->n_vlib_mains - 1);
1224 
1225  vhost_cpu_t *cpu;
1226  vec_foreach (cpu, vum->cpus)
1227  {
1228  /* This is actually not necessary as validate already zeroes it
1229  * Just keeping the loop here for later because I am lazy. */
1230  cpu->rx_buffers_len = 0;
1231  }
1232 
1233  /* find out which cpus will be used for input */
1234  vum->input_cpu_first_index = 0;
1235  vum->input_cpu_count = 1;
1236  p = hash_get_mem (tm->thread_registrations_by_name, "workers");
1237  tr = p ? (vlib_thread_registration_t *) p[0] : 0;
1238 
1239  if (tr && tr->count > 0)
1240  {
1242  vum->input_cpu_count = tr->count;
1243  }
1244 
1245  vum->random = random_default_seed ();
1246 
1247  return 0;
1248 }
1249 
1251 
1252 static clib_error_t *
1254 {
1255  /* TODO cleanup */
1256  return 0;
1257 }
1258 
1260 
/* format() callback for vhost-user packet traces: renders interface name,
 * queue id, virtio ring flags, and the virtio_net header captured in the
 * vhost_trace_t. Registered as .format_trace on the input node and
 * .format_tx_trace on the device class.
 * NOTE(review): lines looking up vum/vui/sw are missing from this
 * extraction — confirm against the full source. */
1261 static u8 *
1262 format_vhost_trace (u8 * s, va_list * va)
1263 {
1264  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *);
1265  CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *);
1266  CLIB_UNUSED (vnet_main_t * vnm) = vnet_get_main ();
1268  vhost_trace_t *t = va_arg (*va, vhost_trace_t *);
1270  t->device_index);
1271 
1273 
1274  uword indent = format_get_indent (s);
1275 
1276  s = format (s, "%U %U queue %d\n", format_white_space, indent,
1277  format_vnet_sw_interface_name, vnm, sw, t->qid);
1278 
 /* print one line per virtio trace flag that is set */
1279  s = format (s, "%U virtio flags:\n", format_white_space, indent);
1280 #define _(n,i,st) \
1281  if (t->virtio_ring_flags & (1 << VIRTIO_TRACE_F_##n)) \
1282  s = format (s, "%U %s %s\n", format_white_space, indent, #n, st);
1284 #undef _
1285  s = format (s, "%U virtio_net_hdr first_desc_len %u\n",
1286  format_white_space, indent, t->first_desc_len);
1287 
1288  s = format (s, "%U flags 0x%02x gso_type %u\n",
1289  format_white_space, indent,
1290  t->hdr.hdr.flags, t->hdr.hdr.gso_type);
1291 
 /* num_buffers exists only with the 12-byte (mergeable rx buf) header */
1292  if (vui->virtio_net_hdr_sz == 12)
1293  s = format (s, "%U num_buff %u",
1294  format_white_space, indent, t->hdr.num_buffers);
1295 
1296  return s;
1297 }
1298 
/* Fill a vhost_trace_t for a packet received from the guest TX ring
 * (VPP RX path): classifies the first descriptor (indirect / chained /
 * single) and snapshots the virtio_net header when it can be mapped.
 * NOTE(review): the function-name line and the vum lookup are missing
 * from this extraction. */
1299 void
1301  vhost_user_intf_t * vui, u16 qid,
1302  vlib_buffer_t * b, vhost_user_vring_t * txvq)
1303 {
1305  u32 qsz_mask = txvq->qsz - 1;
1306  u32 last_avail_idx = txvq->last_avail_idx;
1307  u32 desc_current = txvq->avail->ring[last_avail_idx & qsz_mask];
1308  vring_desc_t *hdr_desc = 0;
1309  virtio_net_hdr_mrg_rxbuf_t *hdr;
1310  u32 hint = 0;
1311 
1312  memset (t, 0, sizeof (*t));
1313  t->device_index = vui - vum->vhost_user_interfaces;
1314  t->qid = qid;
1315 
1316  hdr_desc = &txvq->desc[desc_current];
1317  if (txvq->desc[desc_current].flags & VIRTQ_DESC_F_INDIRECT)
1318  {
1319  t->virtio_ring_flags |= 1 << VIRTIO_TRACE_F_INDIRECT;
1320  /* Header is the first here */
1321  hdr_desc = map_guest_mem (vui, txvq->desc[desc_current].addr, &hint);
1322  }
1323  if (txvq->desc[desc_current].flags & VIRTQ_DESC_F_NEXT)
1324  {
1325  t->virtio_ring_flags |= 1 << VIRTIO_TRACE_F_SIMPLE_CHAINED;
1326  }
 /* neither NEXT nor INDIRECT: the whole packet fits one descriptor */
1327  if (!(txvq->desc[desc_current].flags & VIRTQ_DESC_F_NEXT) &&
1328  !(txvq->desc[desc_current].flags & VIRTQ_DESC_F_INDIRECT))
1329  {
1330  t->virtio_ring_flags |= 1 << VIRTIO_TRACE_F_SINGLE_DESC;
1331  }
1332 
1333  t->first_desc_len = hdr_desc ? hdr_desc->len : 0;
1334 
1335  if (!hdr_desc || !(hdr = map_guest_mem (vui, hdr_desc->addr, &hint)))
1336  {
1337  t->virtio_ring_flags |= 1 << VIRTIO_TRACE_F_MAP_ERROR;
1338  }
1339  else
1340  {
 /* copy at most hdr_desc->len bytes — the descriptor may be shorter
  * than the negotiated virtio_net header size */
1341  u32 len = vui->virtio_net_hdr_sz;
1342  memcpy (&t->hdr, hdr, len > hdr_desc->len ? hdr_desc->len : len);
1343  }
1344 }
1345 
/* Signal the guest (interrupt coalescing flush): write an 8-byte counter
 * value to the vring's call eventfd, then reset the coalescing state and
 * re-arm the deadline.
 * NOTE(review): the function-name line and vum lookup are missing from
 * this extraction. */
1346 static inline void
1348 {
1350  u64 x = 1;
1351  int fd = UNIX_GET_FD (vq->callfd_idx);
1352  int rv __attribute__ ((unused));
1353  /* TODO: pay attention to rv */
1354  rv = write (fd, &x, sizeof (x));
1355  vq->n_since_last_int = 0;
1356  vq->int_deadline = vlib_time_now (vm) + vum->coalesce_time;
1357 }
1358 
/* Execute the deferred guest->host copy orders built during RX ring parsing.
 * Processes copies pairwise while >= 4 remain so the next two sources can be
 * mapped and prefetched ahead of the memcpy; drains the remainder one by one.
 * Returns 0 on success, 1 on any guest-memory mapping failure (caller shuts
 * the interface down).
 * NOTE(review): the signature lines are missing from this extraction. */
1361  u16 copy_len, u32 * map_hint)
1362 {
1363  void *src0, *src1, *src2, *src3;
1364  if (PREDICT_TRUE (copy_len >= 4))
1365  {
 /* prime the software pipeline with the first two mappings */
1366  if (PREDICT_FALSE (!(src2 = map_guest_mem (vui, cpy[0].src, map_hint))))
1367  return 1;
1368  if (PREDICT_FALSE (!(src3 = map_guest_mem (vui, cpy[1].src, map_hint))))
1369  return 1;
1370 
1371  while (PREDICT_TRUE (copy_len >= 4))
1372  {
1373  src0 = src2;
1374  src1 = src3;
1375 
 /* map two copies ahead so the prefetch has time to land */
1376  if (PREDICT_FALSE
1377  (!(src2 = map_guest_mem (vui, cpy[2].src, map_hint))))
1378  return 1;
1379  if (PREDICT_FALSE
1380  (!(src3 = map_guest_mem (vui, cpy[3].src, map_hint))))
1381  return 1;
1382 
1383  CLIB_PREFETCH (src2, 64, LOAD);
1384  CLIB_PREFETCH (src3, 64, LOAD);
1385 
1386  clib_memcpy ((void *) cpy[0].dst, src0, cpy[0].len);
1387  clib_memcpy ((void *) cpy[1].dst, src1, cpy[1].len);
1388  copy_len -= 2;
1389  cpy += 2;
1390  }
1391  }
 /* tail: fewer than 4 copies left (or pipeline drained) */
1392  while (copy_len)
1393  {
1394  if (PREDICT_FALSE (!(src0 = map_guest_mem (vui, cpy->src, map_hint))))
1395  return 1;
1396  clib_memcpy ((void *) cpy->dst, src0, cpy->len);
1397  copy_len -= 1;
1398  cpy += 1;
1399  }
1400  return 0;
1401 }
1402 
1403 /**
1404  * Try to discard packets from the tx ring (VPP RX path).
1405  * Returns the number of discarded packets.
1406  *
1407  * Each discarded packet's head descriptor is moved straight from the
 * avail ring to the used ring with a zero length, then the used index
 * is published once at the end.
 * NOTE(review): the function-name line and a probable memory barrier
 * before the used->idx store are missing from this extraction.
1406  */
1407 u32
1409  vhost_user_intf_t * vui,
1410  vhost_user_vring_t * txvq, u32 discard_max)
1411 {
1412  /*
1413  * On the RX side, each packet corresponds to one descriptor
1414  * (it is the same whether it is a shallow descriptor, chained, or indirect).
1415  * Therefore, discarding a packet is like discarding a descriptor.
1416  */
1417  u32 discarded_packets = 0;
1418  u32 avail_idx = txvq->avail->idx;
1419  u16 qsz_mask = txvq->qsz - 1;
1420  while (discarded_packets != discard_max)
1421  {
 /* stop early if the guest has no more descriptors queued */
1422  if (avail_idx == txvq->last_avail_idx)
1423  goto out;
1424 
1425  u16 desc_chain_head =
1426  txvq->avail->ring[txvq->last_avail_idx & qsz_mask];
1427  txvq->last_avail_idx++;
1428  txvq->used->ring[txvq->last_used_idx & qsz_mask].id = desc_chain_head;
1429  txvq->used->ring[txvq->last_used_idx & qsz_mask].len = 0;
1430  vhost_user_log_dirty_ring (vui, txvq,
1431  ring[txvq->last_used_idx & qsz_mask]);
1432  txvq->last_used_idx++;
1433  discarded_packets++;
1434  }
1435 
1436 out:
1438  txvq->used->idx = txvq->last_used_idx;
1439  vhost_user_log_dirty_ring (vui, txvq, idx);
1440  return discarded_packets;
1441 }
1442 
1443 /*
1444  * In case of overflow, we need to rewind the array of allocated buffers.
 *
 * Walks back from the last (partially filled) buffer to the chain head,
 * returning each buffer index to the per-CPU free list and resetting the
 * buffer so it can be reused.
 * NOTE(review): the function-name line is missing from this extraction.
1445  */
1446 static void
1448  vhost_cpu_t * cpu, vlib_buffer_t * b_head)
1449 {
 /* rx_buffers_len still points at the most recently taken buffer */
1450  u32 bi_current = cpu->rx_buffers[cpu->rx_buffers_len];
1451  vlib_buffer_t *b_current = vlib_get_buffer (vm, bi_current);
1452  b_current->current_length = 0;
1453  b_current->flags = 0;
1454  while (b_current != b_head)
1455  {
 /* give the buffer back by growing the free list upward */
1456  cpu->rx_buffers_len++;
1457  bi_current = cpu->rx_buffers[cpu->rx_buffers_len];
1458  b_current = vlib_get_buffer (vm, bi_current);
1459  b_current->current_length = 0;
1460  b_current->flags = 0;
1461  }
1462 }
1463 
/* Per-queue RX worker: drains the guest TX vring into vlib buffers.
 * Ring parsing and the actual memory copies are decoupled — copy orders are
 * accumulated in the per-CPU copy[] array and executed in batches.
 * Returns the number of packets received.
 * NOTE(review): the function-name line and several statements are missing
 * from this extraction (doxygen line gaps). */
1464 static u32
1466  vhost_user_main_t * vum,
1467  vhost_user_intf_t * vui,
1468  u16 qid, vlib_node_runtime_t * node)
1469 {
1470  vhost_user_vring_t *txvq = &vui->vrings[VHOST_VRING_IDX_TX (qid)];
1471  u16 n_rx_packets = 0;
1472  u32 n_rx_bytes = 0;
1473  u16 n_left;
1474  u32 n_left_to_next, *to_next;
1476  u32 n_trace = vlib_get_trace_count (vm, node);
1477  u16 qsz_mask;
1478  u32 map_hint = 0;
1479  u16 cpu_index = os_get_cpu_number ();
1480  u16 copy_len = 0;
1481 
1482  {
1483  /* do we have pending interrupts ? */
1484  vhost_user_vring_t *rxvq = &vui->vrings[VHOST_VRING_IDX_RX (qid)];
1485  f64 now = vlib_time_now (vm);
1486 
1487  if ((txvq->n_since_last_int) && (txvq->int_deadline < now))
1488  vhost_user_send_call (vm, txvq);
1489 
1490  if ((rxvq->n_since_last_int) && (rxvq->int_deadline < now))
1491  vhost_user_send_call (vm, rxvq);
1492  }
1493 
 /* avail->flags should only ever use bit 0 (NO_INTERRUPT); anything else
  * suggests a corrupted or un-mapped ring, so bail out */
1494  if (PREDICT_FALSE (txvq->avail->flags & 0xFFFE))
1495  return 0;
1496 
1497  n_left = (u16) (txvq->avail->idx - txvq->last_avail_idx);
1498 
1499  /* nothing to do */
1500  if (PREDICT_FALSE (n_left == 0))
1501  return 0;
1502 
1503  if (PREDICT_FALSE (!vui->admin_up || !(txvq->enabled)))
1504  {
1505  /*
1506  * Discard input packet if interface is admin down or vring is not
1507  * enabled.
1508  * "For example, for a networking device, in the disabled state
1509  * client must not supply any new RX packets, but must process
1510  * and discard any TX packets."
1511  */
1512  vhost_user_rx_discard_packet (vm, vui, txvq,
1514  return 0;
1515  }
1516 
1517  if (PREDICT_FALSE (n_left == txvq->qsz))
1518  {
1519  /*
1520  * Informational error logging when VPP is not
1521  * receiving packets fast enough.
1522  */
1523  vlib_error_count (vm, node->node_index,
1524  VHOST_USER_INPUT_FUNC_ERROR_FULL_RX_QUEUE, 1);
1525  }
1526 
1527  qsz_mask = txvq->qsz - 1;
1528 
1529  if (n_left > VLIB_FRAME_SIZE)
1530  n_left = VLIB_FRAME_SIZE;
1531 
1532  /*
1533  * For small packets (<2kB), we will not need more than one vlib buffer
1534  * per packet. In case packets are bigger, we will just yield at some point
1535  * in the loop and come back later. This is not an issue as for big packet,
1536  * processing cost really comes from the memory copy.
1537  */
1538  if (PREDICT_FALSE (vum->cpus[cpu_index].rx_buffers_len < n_left + 1))
1539  {
1540  u32 curr_len = vum->cpus[cpu_index].rx_buffers_len;
1541  vum->cpus[cpu_index].rx_buffers_len +=
1543  vum->cpus[cpu_index].rx_buffers +
1544  curr_len,
1545  VHOST_USER_RX_BUFFERS_N - curr_len,
1547 
1548  if (PREDICT_FALSE
1549  (vum->cpus[cpu_index].rx_buffers_len <
1551  {
1552  /* In case of buffer starvation, discard some packets from the queue
1553  * and log the event.
1554  * We keep doing best effort for the remaining packets. */
1555  u32 flush = (n_left + 1 > vum->cpus[cpu_index].rx_buffers_len) ?
1556  n_left + 1 - vum->cpus[cpu_index].rx_buffers_len : 1;
1557  flush = vhost_user_rx_discard_packet (vm, vui, txvq, flush);
1558 
1559  n_left -= flush;
1561  interface_main.sw_if_counters +
1563  os_get_cpu_number (),
1564  vui->sw_if_index, flush);
1565 
1567  VHOST_USER_INPUT_FUNC_ERROR_NO_BUFFER, flush);
1568  }
1569  }
1570 
1571  while (n_left > 0)
1572  {
1573  vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
1574 
1575  while (n_left > 0 && n_left_to_next > 0)
1576  {
1577  vlib_buffer_t *b_head, *b_current;
1578  u32 bi_current;
1579  u16 desc_current;
1580  u32 desc_data_offset;
1581  vring_desc_t *desc_table = txvq->desc;
1582 
1583  if (PREDICT_FALSE (vum->cpus[cpu_index].rx_buffers_len <= 1))
1584  {
1585  /* Not enough rx_buffers
1586  * Note: We yield on 1 so we don't need to do an additional
1587  * check for the next buffer prefetch.
1588  */
1589  n_left = 0;
1590  break;
1591  }
1592 
1593  desc_current = txvq->avail->ring[txvq->last_avail_idx & qsz_mask];
1594  vum->cpus[cpu_index].rx_buffers_len--;
1595  bi_current = (vum->cpus[cpu_index].rx_buffers)
1596  [vum->cpus[cpu_index].rx_buffers_len];
1597  b_head = b_current = vlib_get_buffer (vm, bi_current);
1598  to_next[0] = bi_current; //We do that now so we can forget about bi_current
1599  to_next++;
1600  n_left_to_next--;
1601 
1603  (vum->cpus[cpu_index].rx_buffers)
1604  [vum->cpus[cpu_index].
1605  rx_buffers_len - 1], LOAD);
1606 
1607  /* Just preset the used descriptor id and length for later */
1608  txvq->used->ring[txvq->last_used_idx & qsz_mask].id = desc_current;
1609  txvq->used->ring[txvq->last_used_idx & qsz_mask].len = 0;
1610  vhost_user_log_dirty_ring (vui, txvq,
1611  ring[txvq->last_used_idx & qsz_mask]);
1612 
1613  /* The buffer should already be initialized */
1616 
1617  if (PREDICT_FALSE (n_trace))
1618  {
1619  //TODO: next_index is not exactly known at that point
1620  vlib_trace_buffer (vm, node, next_index, b_head,
1621  /* follow_chain */ 0);
1622  vhost_trace_t *t0 =
1623  vlib_add_trace (vm, node, b_head, sizeof (t0[0]));
1624  vhost_user_rx_trace (t0, vui, qid, b_head, txvq);
1625  n_trace--;
1626  vlib_set_trace_count (vm, node, n_trace);
1627  }
1628 
1629  /* This depends on the setup but is very consistent
1630  * So I think the CPU branch predictor will do a pretty good job
1631  * at optimizing the decision. */
1632  if (txvq->desc[desc_current].flags & VIRTQ_DESC_F_INDIRECT)
1633  {
1634  desc_table = map_guest_mem (vui, txvq->desc[desc_current].addr,
1635  &map_hint);
1636  desc_current = 0;
1637  if (PREDICT_FALSE (desc_table == 0))
1638  {
1639  //FIXME: Handle error by shutdown the queue
1640  goto out;
1641  }
1642  }
1643 
1644  if (PREDICT_TRUE (vui->is_any_layout) ||
1645  (!(desc_table[desc_current].flags & VIRTQ_DESC_F_NEXT)))
1646  {
1647  /* ANYLAYOUT or single buffer */
1648  desc_data_offset = vui->virtio_net_hdr_sz;
1649  }
1650  else
1651  {
1652  /* CSR case without ANYLAYOUT, skip 1st buffer */
1653  desc_data_offset = desc_table[desc_current].len;
1654  }
1655 
1656  while (1)
1657  {
1658  /* Get more input if necessary. Or end of packet. */
1659  if (desc_data_offset == desc_table[desc_current].len)
1660  {
1661  if (PREDICT_FALSE (desc_table[desc_current].flags &
1662  VIRTQ_DESC_F_NEXT))
1663  {
1664  desc_current = desc_table[desc_current].next;
1665  desc_data_offset = 0;
1666  }
1667  else
1668  {
1669  goto out;
1670  }
1671  }
1672 
1673  /* Get more output if necessary. Or end of packet. */
1674  if (PREDICT_FALSE
1675  (b_current->current_length == VLIB_BUFFER_DATA_SIZE))
1676  {
1677  if (PREDICT_FALSE
1678  (vum->cpus[cpu_index].rx_buffers_len == 0))
1679  {
1680  /* Cancel speculation */
1681  to_next--;
1682  n_left_to_next++;
1683 
1684  /*
1685  * Checking if there are some left buffers.
1686  * If not, just rewind the used buffers and stop.
1687  * Note: Scheduled copies are not cancelled. This is
1688  * not an issue as they would still be valid. Useless,
1689  * but valid.
1690  */
1692  &vum->cpus[cpu_index],
1693  b_head);
1694  n_left = 0;
1695  goto stop;
1696  }
1697 
1698  /* Get next output */
1699  vum->cpus[cpu_index].rx_buffers_len--;
1700  u32 bi_next =
1701  (vum->cpus[cpu_index].rx_buffers)[vum->cpus
1702  [cpu_index].rx_buffers_len];
1703  b_current->next_buffer = bi_next;
1704  b_current->flags |= VLIB_BUFFER_NEXT_PRESENT;
1705  bi_current = bi_next;
1706  b_current = vlib_get_buffer (vm, bi_current);
1707  }
1708 
1709  /* Prepare a copy order executed later for the data */
1710  vhost_copy_t *cpy = &vum->cpus[cpu_index].copy[copy_len];
1711  copy_len++;
1712  u32 desc_data_l =
1713  desc_table[desc_current].len - desc_data_offset;
1714  cpy->len = VLIB_BUFFER_DATA_SIZE - b_current->current_length;
1715  cpy->len = (cpy->len > desc_data_l) ? desc_data_l : cpy->len;
1716  cpy->dst = (uword) vlib_buffer_get_current (b_current);
1717  cpy->src = desc_table[desc_current].addr + desc_data_offset;
1718 
1719  desc_data_offset += cpy->len;
1720 
1721  b_current->current_length += cpy->len;
1723  }
1724 
1725  out:
1726  CLIB_PREFETCH (&n_left, sizeof (n_left), LOAD);
1727 
1728  n_rx_bytes += b_head->total_length_not_including_first_buffer;
1729  n_rx_packets++;
1730 
1732  b_head->current_length;
1733 
1734  /* consume the descriptor and return it as used */
1735  txvq->last_avail_idx++;
1736  txvq->last_used_idx++;
1737 
1739 
1740  vnet_buffer (b_head)->sw_if_index[VLIB_RX] = vui->sw_if_index;
1741  vnet_buffer (b_head)->sw_if_index[VLIB_TX] = (u32) ~ 0;
1742  b_head->error = 0;
1743 
1744  {
1746 
1747  /* redirect if feature path enabled */
1749  b_head, 0);
1750 
1751  u32 bi = to_next[-1]; //Cannot use to_next[-1] in the macro
1752  vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
1753  to_next, n_left_to_next,
1754  bi, next0);
1755  }
1756 
1757  n_left--;
1758 
1759  /*
1760  * Although separating memory copies from virtio ring parsing
1761  * is beneficial, we can offer to perform the copies from time
1762  * to time in order to free some space in the ring.
1763  */
1764  if (PREDICT_FALSE (copy_len >= VHOST_USER_RX_COPY_THRESHOLD))
1765  {
1766  if (PREDICT_FALSE
1767  (vhost_user_input_copy (vui, vum->cpus[cpu_index].copy,
1768  copy_len, &map_hint)))
1769  {
1770  clib_warning
1771  ("Memory mapping error on interface hw_if_index=%d "
1772  "(Shutting down - Switch interface down and up to restart)",
1773  vui->hw_if_index);
1774  vui->admin_up = 0;
1775  copy_len = 0;
1776  break;
1777  }
1778  copy_len = 0;
1779 
1780  /* give buffers back to driver */
1782  txvq->used->idx = txvq->last_used_idx;
1783  vhost_user_log_dirty_ring (vui, txvq, idx);
1784  }
1785  }
1786  stop:
1787  vlib_put_next_frame (vm, node, next_index, n_left_to_next);
1788 
1789 
1790  /* Do the memory copies */
1791  if (PREDICT_FALSE
1792  (vhost_user_input_copy (vui, vum->cpus[cpu_index].copy,
1793  copy_len, &map_hint)))
1794  {
1795  clib_warning ("Memory mapping error on interface hw_if_index=%d "
1796  "(Shutting down - Switch interface down and up to restart)",
1797  vui->hw_if_index);
1798  vui->admin_up = 0;
1799  }
1800 
1801  /* give buffers back to driver */
1803  txvq->used->idx = txvq->last_used_idx;
1804  vhost_user_log_dirty_ring (vui, txvq, idx);
1805 
1806  /* interrupt (call) handling */
1807  if ((txvq->callfd_idx != ~0) && !(txvq->avail->flags & 1))
1808  {
1809  txvq->n_since_last_int += n_rx_packets;
1810 
1811  if (txvq->n_since_last_int > vum->coalesce_frames)
1812  vhost_user_send_call (vm, txvq);
1813  }
1814 
1815  /* increase rx counters */
1819  os_get_cpu_number (), vui->sw_if_index, n_rx_packets, n_rx_bytes);
1820 
1821  return n_rx_packets;
1822 }
1823 
/* Input node dispatch function: iterates the RX queues assigned to this
 * worker CPU and polls each one via vhost_user_if_input().
 * NOTE(review): the function-name line and the vum/vhiq declarations are
 * missing from this extraction. */
1824 static uword
1826  vlib_node_runtime_t * node, vlib_frame_t * f)
1827 {
1829  uword n_rx_packets = 0;
1830  u32 cpu_index = os_get_cpu_number ();
1831 
1832 
1834  vec_foreach (vhiq, vum->cpus[cpu_index].rx_queues)
1835  {
1836  vhost_user_intf_t *vui =
1838  n_rx_packets += vhost_user_if_input (vm, vum, vui, vhiq->qid, node);
1839  }
1840 
1841  return n_rx_packets;
1842 }
1843 
1844 /* *INDENT-OFF* */
/* Input node registration: sibling of "device-input", starts disabled and
 * is enabled when a vhost-user interface comes up.
 * NOTE(review): the VLIB_REGISTER_NODE line and the multiarch macro line
 * are missing from this extraction. */
1846  .function = vhost_user_input,
1847  .type = VLIB_NODE_TYPE_INPUT,
1848  .name = "vhost-user-input",
1849  .sibling_of = "device-input",
1850 
1851  /* Will be enabled if/when hardware is detected. */
1852  .state = VLIB_NODE_STATE_DISABLED,
1853 
1854  .format_buffer = format_ethernet_header_with_length,
1855  .format_trace = format_vhost_trace,
1856 
1857  .n_errors = VHOST_USER_INPUT_FUNC_N_ERROR,
1858  .error_strings = vhost_user_input_func_error_strings,
1859 };
1860 
1862 /* *INDENT-ON* */
1863 
1864 
/* Fill a vhost_trace_t for a packet sent to the guest RX ring (VPP TX
 * path): classifies the first descriptor (indirect / chained / single).
 * Unlike the RX tracer, the virtio_net header is not snapshotted here —
 * it is filled in later by vhost_user_tx from tx_headers.
 * NOTE(review): the function-name line and vum lookup are missing from
 * this extraction. */
1865 void
1867  vhost_user_intf_t * vui, u16 qid,
1868  vlib_buffer_t * b, vhost_user_vring_t * rxvq)
1869 {
1871  u32 qsz_mask = rxvq->qsz - 1;
1872  u32 last_avail_idx = rxvq->last_avail_idx;
1873  u32 desc_current = rxvq->avail->ring[last_avail_idx & qsz_mask];
1874  vring_desc_t *hdr_desc = 0;
1875  u32 hint = 0;
1876 
1877  memset (t, 0, sizeof (*t));
1878  t->device_index = vui - vum->vhost_user_interfaces;
1879  t->qid = qid;
1880 
1881  hdr_desc = &rxvq->desc[desc_current];
1882  if (rxvq->desc[desc_current].flags & VIRTQ_DESC_F_INDIRECT)
1883  {
1884  t->virtio_ring_flags |= 1 << VIRTIO_TRACE_F_INDIRECT;
1885  /* Header is the first here */
1886  hdr_desc = map_guest_mem (vui, rxvq->desc[desc_current].addr, &hint);
1887  }
1888  if (rxvq->desc[desc_current].flags & VIRTQ_DESC_F_NEXT)
1889  {
1890  t->virtio_ring_flags |= 1 << VIRTIO_TRACE_F_SIMPLE_CHAINED;
1891  }
 /* neither NEXT nor INDIRECT: single-descriptor buffer */
1892  if (!(rxvq->desc[desc_current].flags & VIRTQ_DESC_F_NEXT) &&
1893  !(rxvq->desc[desc_current].flags & VIRTQ_DESC_F_INDIRECT))
1894  {
1895  t->virtio_ring_flags |= 1 << VIRTIO_TRACE_F_SINGLE_DESC;
1896  }
1897 
1898  t->first_desc_len = hdr_desc ? hdr_desc->len : 0;
1899 }
1900 
/* Execute the deferred host->guest copy orders built during TX ring parsing.
 * Mirror image of vhost_user_input_copy: destinations (guest memory) are
 * mapped and pipelined two ahead, sources are prefetched, and each copied
 * region is logged as dirty for live migration.
 * Returns 0 on success, 1 on any guest-memory mapping failure.
 * NOTE(review): the signature lines are missing from this extraction. */
1903  u16 copy_len, u32 * map_hint)
1904 {
1905  void *dst0, *dst1, *dst2, *dst3;
1906  if (PREDICT_TRUE (copy_len >= 4))
1907  {
 /* prime the pipeline with the first two destination mappings */
1908  if (PREDICT_FALSE (!(dst2 = map_guest_mem (vui, cpy[0].dst, map_hint))))
1909  return 1;
1910  if (PREDICT_FALSE (!(dst3 = map_guest_mem (vui, cpy[1].dst, map_hint))))
1911  return 1;
1912  while (PREDICT_TRUE (copy_len >= 4))
1913  {
1914  dst0 = dst2;
1915  dst1 = dst3;
1916 
1917  if (PREDICT_FALSE
1918  (!(dst2 = map_guest_mem (vui, cpy[2].dst, map_hint))))
1919  return 1;
1920  if (PREDICT_FALSE
1921  (!(dst3 = map_guest_mem (vui, cpy[3].dst, map_hint))))
1922  return 1;
1923 
1924  CLIB_PREFETCH ((void *) cpy[2].src, 64, LOAD);
1925  CLIB_PREFETCH ((void *) cpy[3].src, 64, LOAD);
1926 
1927  clib_memcpy (dst0, (void *) cpy[0].src, cpy[0].len);
1928  clib_memcpy (dst1, (void *) cpy[1].src, cpy[1].len);
1929 
 /* mark written guest pages dirty (live-migration log) */
1930  vhost_user_log_dirty_pages_2 (vui, cpy[0].dst, cpy[0].len, 1);
1931  vhost_user_log_dirty_pages_2 (vui, cpy[1].dst, cpy[1].len, 1);
1932  copy_len -= 2;
1933  cpy += 2;
1934  }
1935  }
 /* tail: drain remaining copies one by one */
1936  while (copy_len)
1937  {
1938  if (PREDICT_FALSE (!(dst0 = map_guest_mem (vui, cpy->dst, map_hint))))
1939  return 1;
1940  clib_memcpy (dst0, (void *) cpy->src, cpy->len);
1941  vhost_user_log_dirty_pages_2 (vui, cpy->dst, cpy->len, 1);
1942  copy_len -= 1;
1943  cpy += 1;
1944  }
1945  return 0;
1946 }
1947 
1948 
/* Device-class TX function: copies a frame of vlib buffers into the guest
 * RX vring. Builds virtio_net headers in per-CPU tx_headers[], accumulates
 * copy orders, executes them in one batch at the end, then handles retry,
 * interrupt coalescing, and error accounting.
 * NOTE(review): the function-name line and several statements are missing
 * from this extraction (doxygen line gaps). */
1949 static uword
1951  vlib_node_runtime_t * node, vlib_frame_t * frame)
1952 {
1953  u32 *buffers = vlib_frame_args (frame);
1954  u32 n_left = frame->n_vectors;
1956  vnet_interface_output_runtime_t *rd = (void *) node->runtime_data;
1957  vhost_user_intf_t *vui =
1959  u32 qid = ~0;
1960  vhost_user_vring_t *rxvq;
1961  u16 qsz_mask;
1962  u8 error;
1963  u32 cpu_index = os_get_cpu_number ();
1964  u32 map_hint = 0;
1965  u8 retry = 8;
1966  u16 copy_len;
1967  u16 tx_headers_len;
1968 
1969  if (PREDICT_FALSE (!vui->admin_up))
1970  {
1971  error = VHOST_USER_TX_FUNC_ERROR_DOWN;
1972  goto done3;
1973  }
1974 
1975  if (PREDICT_FALSE (!vui->is_up))
1976  {
1977  error = VHOST_USER_TX_FUNC_ERROR_NOT_READY;
1978  goto done3;
1979  }
1980 
 /* pick the per-CPU TX queue; lock only when queues are shared */
1981  qid =
1983  (vui->per_cpu_tx_qid, os_get_cpu_number ()));
1984  rxvq = &vui->vrings[qid];
1985  if (PREDICT_FALSE (vui->use_tx_spinlock))
1986  vhost_user_vring_lock (vui, qid);
1987 
1988  qsz_mask = rxvq->qsz - 1; /* qsz is always power of 2 */
1989 
1990 retry:
1991  error = VHOST_USER_TX_FUNC_ERROR_NONE;
1992  tx_headers_len = 0;
1993  copy_len = 0;
1994  while (n_left > 0)
1995  {
1996  vlib_buffer_t *b0, *current_b0;
1997  u16 desc_head, desc_index, desc_len;
1998  vring_desc_t *desc_table;
1999  uword buffer_map_addr;
2000  u32 buffer_len;
2001  u16 bytes_left;
2002 
2003  if (PREDICT_TRUE (n_left > 1))
2004  vlib_prefetch_buffer_with_index (vm, buffers[1], LOAD);
2005 
2006  b0 = vlib_get_buffer (vm, buffers[0]);
2007 
2009  {
2010  vum->cpus[cpu_index].current_trace =
2011  vlib_add_trace (vm, node, b0,
2012  sizeof (*vum->cpus[cpu_index].current_trace));
2013  vhost_user_tx_trace (vum->cpus[cpu_index].current_trace,
2014  vui, qid / 2, b0, rxvq);
2015  }
2016 
 /* no free descriptor available from the guest */
2017  if (PREDICT_FALSE (rxvq->last_avail_idx == rxvq->avail->idx))
2018  {
2019  error = VHOST_USER_TX_FUNC_ERROR_PKT_DROP_NOBUF;
2020  goto done;
2021  }
2022 
2023  desc_table = rxvq->desc;
2024  desc_head = desc_index =
2025  rxvq->avail->ring[rxvq->last_avail_idx & qsz_mask];
2026 
2027  /* Go deeper in case of indirect descriptor
2028  * I don't know of any driver providing indirect for RX. */
2029  if (PREDICT_FALSE (rxvq->desc[desc_head].flags & VIRTQ_DESC_F_INDIRECT))
2030  {
2031  if (PREDICT_FALSE
2032  (rxvq->desc[desc_head].len < sizeof (vring_desc_t)))
2033  {
2034  error = VHOST_USER_TX_FUNC_ERROR_INDIRECT_OVERFLOW;
2035  goto done;
2036  }
2037  if (PREDICT_FALSE
2038  (!(desc_table =
2039  map_guest_mem (vui, rxvq->desc[desc_index].addr,
2040  &map_hint))))
2041  {
2042  error = VHOST_USER_TX_FUNC_ERROR_MMAP_FAIL;
2043  goto done;
2044  }
2045  desc_index = 0;
2046  }
2047 
2048  desc_len = vui->virtio_net_hdr_sz;
2049  buffer_map_addr = desc_table[desc_index].addr;
2050  buffer_len = desc_table[desc_index].len;
2051 
2052  {
2053  // Get a header from the header array
2054  virtio_net_hdr_mrg_rxbuf_t *hdr =
2055  &vum->cpus[cpu_index].tx_headers[tx_headers_len];
2056  tx_headers_len++;
2057  hdr->hdr.flags = 0;
2058  hdr->hdr.gso_type = 0;
2059  hdr->num_buffers = 1; //This is local, no need to check
2060 
2061  // Prepare a copy order executed later for the header
2062  vhost_copy_t *cpy = &vum->cpus[cpu_index].copy[copy_len];
2063  copy_len++;
2064  cpy->len = vui->virtio_net_hdr_sz;
2065  cpy->dst = buffer_map_addr;
2066  cpy->src = (uword) hdr;
2067  }
2068 
2069  buffer_map_addr += vui->virtio_net_hdr_sz;
2070  buffer_len -= vui->virtio_net_hdr_sz;
2071  bytes_left = b0->current_length;
2072  current_b0 = b0;
2073  while (1)
2074  {
2075  if (buffer_len == 0)
2076  { //Get new output
2077  if (desc_table[desc_index].flags & VIRTQ_DESC_F_NEXT)
2078  {
2079  //Next one is chained
2080  desc_index = desc_table[desc_index].next;
2081  buffer_map_addr = desc_table[desc_index].addr;
2082  buffer_len = desc_table[desc_index].len;
2083  }
2084  else if (vui->virtio_net_hdr_sz == 12) //MRG is available
2085  {
2086  virtio_net_hdr_mrg_rxbuf_t *hdr =
2087  &vum->cpus[cpu_index].tx_headers[tx_headers_len - 1];
2088 
2089  //Move from available to used buffer
2090  rxvq->used->ring[rxvq->last_used_idx & qsz_mask].id =
2091  desc_head;
2092  rxvq->used->ring[rxvq->last_used_idx & qsz_mask].len =
2093  desc_len;
2094  vhost_user_log_dirty_ring (vui, rxvq,
2095  ring[rxvq->last_used_idx &
2096  qsz_mask]);
2097 
2098  rxvq->last_avail_idx++;
2099  rxvq->last_used_idx++;
2100  hdr->num_buffers++;
2101  desc_len = 0;
2102 
2103  if (PREDICT_FALSE
2104  (rxvq->last_avail_idx == rxvq->avail->idx))
2105  {
2106  //Dequeue queued descriptors for this packet
2107  rxvq->last_used_idx -= hdr->num_buffers - 1;
2108  rxvq->last_avail_idx -= hdr->num_buffers - 1;
2109  error = VHOST_USER_TX_FUNC_ERROR_PKT_DROP_NOBUF;
2110  goto done;
2111  }
2112 
2113  desc_table = rxvq->desc;
2114  desc_head = desc_index =
2115  rxvq->avail->ring[rxvq->last_avail_idx & qsz_mask];
2116  if (PREDICT_FALSE
2117  (rxvq->desc[desc_head].flags & VIRTQ_DESC_F_INDIRECT))
2118  {
2119  //It is seriously unlikely that a driver will put indirect descriptor
2120  //after non-indirect descriptor.
2121  if (PREDICT_FALSE
2122  (rxvq->desc[desc_head].len < sizeof (vring_desc_t)))
2123  {
2124  error = VHOST_USER_TX_FUNC_ERROR_INDIRECT_OVERFLOW;
2125  goto done;
2126  }
2127  if (PREDICT_FALSE
2128  (!(desc_table =
2129  map_guest_mem (vui,
2130  rxvq->desc[desc_index].addr,
2131  &map_hint))))
2132  {
2133  error = VHOST_USER_TX_FUNC_ERROR_MMAP_FAIL;
2134  goto done;
2135  }
2136  desc_index = 0;
2137  }
2138  buffer_map_addr = desc_table[desc_index].addr;
2139  buffer_len = desc_table[desc_index].len;
2140  }
2141  else
2142  {
 /* no chaining and no mergeable buffers: packet can't fit */
2143  error = VHOST_USER_TX_FUNC_ERROR_PKT_DROP_NOMRG;
2144  goto done;
2145  }
2146  }
2147 
2148  {
 /* schedule a data copy for the current segment */
2149  vhost_copy_t *cpy = &vum->cpus[cpu_index].copy[copy_len];
2150  copy_len++;
2151  cpy->len = bytes_left;
2152  cpy->len = (cpy->len > buffer_len) ? buffer_len : cpy->len;
2153  cpy->dst = buffer_map_addr;
2154  cpy->src = (uword) vlib_buffer_get_current (current_b0) +
2155  current_b0->current_length - bytes_left;
2156 
2157  bytes_left -= cpy->len;
2158  buffer_len -= cpy->len;
2159  buffer_map_addr += cpy->len;
2160  desc_len += cpy->len;
2161 
2162  CLIB_PREFETCH (&rxvq->desc, CLIB_CACHE_LINE_BYTES, LOAD);
2163  }
2164 
2165  // Check if vlib buffer has more data. If not, get more or break.
2166  if (PREDICT_TRUE (!bytes_left))
2167  {
2168  if (PREDICT_FALSE
2169  (current_b0->flags & VLIB_BUFFER_NEXT_PRESENT))
2170  {
2171  current_b0 = vlib_get_buffer (vm, current_b0->next_buffer);
2172  bytes_left = current_b0->current_length;
2173  }
2174  else
2175  {
2176  //End of packet
2177  break;
2178  }
2179  }
2180  }
2181 
2182  //Move from available to used ring
2183  rxvq->used->ring[rxvq->last_used_idx & qsz_mask].id = desc_head;
2184  rxvq->used->ring[rxvq->last_used_idx & qsz_mask].len = desc_len;
2185  vhost_user_log_dirty_ring (vui, rxvq,
2186  ring[rxvq->last_used_idx & qsz_mask]);
2187  rxvq->last_avail_idx++;
2188  rxvq->last_used_idx++;
2189 
2191  {
 /* the trace header can only be completed now that num_buffers is final */
2192  vum->cpus[cpu_index].current_trace->hdr =
2193  vum->cpus[cpu_index].tx_headers[tx_headers_len - 1];
2194  }
2195 
2196  n_left--; //At the end for error counting when 'goto done' is invoked
2197  buffers++;
2198  }
2199 
2200 done:
2201  //Do the memory copies
2202  if (PREDICT_FALSE
2203  (vhost_user_tx_copy (vui, vum->cpus[cpu_index].copy,
2204  copy_len, &map_hint)))
2205  {
2206  clib_warning ("Memory mapping error on interface hw_if_index=%d "
2207  "(Shutting down - Switch interface down and up to restart)",
2208  vui->hw_if_index);
2209  vui->admin_up = 0;
2210  }
2211 
 /* publish the used index so the guest sees the new buffers */
2213  rxvq->used->idx = rxvq->last_used_idx;
2214  vhost_user_log_dirty_ring (vui, rxvq, idx);
2215 
2216  /*
2217  * When n_left is set, error is always set to something too.
2218  * In case error is due to lack of remaining buffers, we go back up and
2219  * retry.
2220  * The idea is that it is better to waste some time on packets
2221  * that have been processed already than dropping them and get
2222  * more fresh packets with a good likelihood that they will be dropped too.
2223  * This technique also gives more time to VM driver to pick-up packets.
2224  * In case the traffic flows from physical to virtual interfaces, this
2225  * technique will end-up leveraging the physical NIC buffer in order to
2226  * absorb the VM's CPU jitter.
2227  */
2228  if (n_left && (error == VHOST_USER_TX_FUNC_ERROR_PKT_DROP_NOBUF) && retry)
2229  {
2230  retry--;
2231  goto retry;
2232  }
2233 
2234  /* interrupt (call) handling */
2235  if ((rxvq->callfd_idx != ~0) && !(rxvq->avail->flags & 1))
2236  {
2237  rxvq->n_since_last_int += frame->n_vectors - n_left;
2238 
2239  if (rxvq->n_since_last_int > vum->coalesce_frames)
2240  vhost_user_send_call (vm, rxvq);
2241  }
2242 
2243  vhost_user_vring_unlock (vui, qid);
2244 
2245 done3:
2246  if (PREDICT_FALSE (n_left && error != VHOST_USER_TX_FUNC_ERROR_NONE))
2247  {
2248  vlib_error_count (vm, node->node_index, error, n_left);
2252  os_get_cpu_number (), vui->sw_if_index, n_left);
2253  }
2254 
 /* buffers were copied into guest memory; always free the whole frame */
2255  vlib_buffer_free (vm, vlib_frame_args (frame), frame->n_vectors);
2256  return frame->n_vectors;
2257 }
2258 
/* Admin up/down callback for the device class: records the new admin state
 * on the vhost_user_intf_t.
 * NOTE(review): the function-name line and the statements executed when
 * bringing the interface up are missing from this extraction. */
2259 static clib_error_t *
2261  u32 flags)
2262 {
2263  vnet_hw_interface_t *hif = vnet_get_hw_interface (vnm, hw_if_index);
2264  uword is_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0;
2266  vhost_user_intf_t *vui =
2268 
2269  vui->admin_up = is_up;
2270 
2271  if (is_up)
2274 
2275  return /* no error */ 0;
2276 }
2277 
2278 /* *INDENT-OFF* */
/* Device class registration tying the vhost-user TX path, naming, tracing
 * and admin up/down handling into vnet. */
2279 VNET_DEVICE_CLASS (vhost_user_dev_class,static) = {
2280  .name = "vhost-user",
2281  .tx_function = vhost_user_tx,
2282  .tx_function_n_errors = VHOST_USER_TX_FUNC_N_ERROR,
2283  .tx_function_error_strings = vhost_user_tx_func_error_strings,
2284  .format_device_name = format_vhost_user_interface_name,
2285  .name_renumber = vhost_user_name_renumber,
2286  .admin_up_down_function = vhost_user_interface_admin_up_down,
2287  .format_tx_trace = format_vhost_trace,
2288 };
2289 
2291  vhost_user_tx)
2292 /* *INDENT-ON* */
2293 
/* Background process: every few seconds, (re)connects client-mode
 * vhost-user interfaces to their unix sockets and health-checks already
 * connected ones via SO_ERROR.
 * NOTE(review): the second parameter line, the vum declaration, and the
 * wait-for-event call are missing from this extraction. */
2294 static uword
2295 vhost_user_process (vlib_main_t * vm,
2297 {
2299  vhost_user_intf_t *vui;
2300  struct sockaddr_un sun;
2301  int sockfd;
2302  unix_file_t template = { 0 };
2303  f64 timeout = 3153600000.0 /* 100 years */ ;
2304  uword *event_data = 0;
2305 
 /* one long-lived socket is reused for connect attempts and replaced
  * once a connection succeeds */
2306  sockfd = socket (AF_UNIX, SOCK_STREAM, 0);
2307  sun.sun_family = AF_UNIX;
2308  template.read_function = vhost_user_socket_read;
2309  template.error_function = vhost_user_socket_error;
2310 
2311  if (sockfd < 0)
2312  return 0;
2313 
2314  while (1)
2315  {
2317  vlib_process_get_events (vm, &event_data);
2318  vec_reset_length (event_data);
2319 
 /* after the first iteration, poll every 3 seconds */
2320  timeout = 3.0;
2321 
2322  /* *INDENT-OFF* */
2323  pool_foreach (vui, vum->vhost_user_interfaces, {
2324 
2325  if (vui->unix_server_index == ~0) { //Nothing to do for server sockets
2326  if (vui->unix_file_index == ~0)
2327  {
2328  /* try to connect */
2329  strncpy (sun.sun_path, (char *) vui->sock_filename,
2330  sizeof (sun.sun_path) - 1);
2331 
2332  /* Avoid hanging VPP if the other end does not accept */
2333  fcntl(sockfd, F_SETFL, O_NONBLOCK);
2334  if (connect (sockfd, (struct sockaddr *) &sun,
2335  sizeof (struct sockaddr_un)) == 0)
2336  {
2337  /* Set the socket to blocking as it was before */
2338  fcntl(sockfd, F_SETFL, 0);
2339  vui->sock_errno = 0;
2340  template.file_descriptor = sockfd;
2341  template.private_data =
2342  vui - vhost_user_main.vhost_user_interfaces;
2343  vui->unix_file_index = unix_file_add (&unix_main, &template);
2344 
2345  //Re-open for next connect
2346  if ((sockfd = socket (AF_UNIX, SOCK_STREAM, 0)) < 0) {
2347  clib_warning("Critical: Could not open unix socket");
2348  return 0;
2349  }
2350  }
2351  else
2352  {
 /* remember why the connect failed for the CLI/show output */
2353  vui->sock_errno = errno;
2354  }
2355  }
2356  else
2357  {
2358  /* check if socket is alive */
2359  int error = 0;
2360  socklen_t len = sizeof (error);
2361  int fd = UNIX_GET_FD(vui->unix_file_index);
2362  int retval =
2363  getsockopt (fd, SOL_SOCKET, SO_ERROR, &error, &len);
2364 
2365  if (retval)
2366  {
2367  DBG_SOCK ("getsockopt returned %d", retval);
2368  vhost_user_if_disconnect (vui);
2369  }
2370  }
2371  }
2372  });
2373  /* *INDENT-ON* */
2374  }
2375  return 0;
2376 }
2377 
2378 /* *INDENT-OFF* */
2380  .function = vhost_user_process,
2381  .type = VLIB_NODE_TYPE_PROCESS,
2382  .name = "vhost-user-process",
2383 };
2384 /* *INDENT-ON* */
2385 
2386 /**
2387  * Disables and reset interface structure.
2388  * It can then be either init again, or removed from used interfaces.
2389  */
2390 static void
2392 {
2393  // Delete configured thread pinning
2394  vec_reset_length (vui->workers);
2395  // disconnect interface sockets
2398 
2399  if (vui->unix_server_index != ~0)
2400  {
2401  //Close server socket
2403  vui->unix_server_index);
2404  unix_file_del (&unix_main, uf);
2405  vui->unix_server_index = ~0;
2406  }
2407 }
2408 
2409 int
2411 {
2413  vhost_user_intf_t *vui;
2414  int rv = 0;
2415  vnet_hw_interface_t *hwif;
2416 
2417  if (!(hwif = vnet_get_sup_hw_interface (vnm, sw_if_index)) ||
2418  hwif->dev_class_index != vhost_user_dev_class.index)
2419  return VNET_API_ERROR_INVALID_SW_IF_INDEX;
2420 
2421  DBG_SOCK ("Deleting vhost-user interface %s (instance %d)",
2422  hwif->name, hwif->dev_instance);
2423 
2425 
2426  // Disable and reset interface
2427  vhost_user_term_if (vui);
2428 
2429  // Reset renumbered iface
2430  if (hwif->dev_instance <
2433 
2434  // Delete ethernet interface
2436 
2437  // Back to pool
2438  pool_put (vum->vhost_user_interfaces, vui);
2439  return rv;
2440 }
2441 
2442 /**
2443  * Open server unix socket on specified sock_filename.
2444  */
2445 static int
2446 vhost_user_init_server_sock (const char *sock_filename, int *sock_fd)
2447 {
2448  int rv = 0;
2449  struct sockaddr_un un = { };
2450  int fd;
2451  /* create listening socket */
2452  if ((fd = socket (AF_UNIX, SOCK_STREAM, 0)) < 0)
2453  return VNET_API_ERROR_SYSCALL_ERROR_1;
2454 
2455  un.sun_family = AF_UNIX;
2456  strncpy ((char *) un.sun_path, (char *) sock_filename,
2457  sizeof (un.sun_path) - 1);
2458 
2459  /* remove if exists */
2460  unlink ((char *) sock_filename);
2461 
2462  if (bind (fd, (struct sockaddr *) &un, sizeof (un)) == -1)
2463  {
2464  rv = VNET_API_ERROR_SYSCALL_ERROR_2;
2465  goto error;
2466  }
2467 
2468  if (listen (fd, 1) == -1)
2469  {
2470  rv = VNET_API_ERROR_SYSCALL_ERROR_3;
2471  goto error;
2472  }
2473 
2474  *sock_fd = fd;
2475  return 0;
2476 
2477 error:
2478  close (fd);
2479  return rv;
2480 }
2481 
2482 /**
2483  * Create ethernet interface for vhost user interface.
2484  */
2485 static void
2487  vhost_user_intf_t * vui, u8 * hwaddress)
2488 {
2490  u8 hwaddr[6];
2491  clib_error_t *error;
2492 
2493  /* create hw and sw interface */
2494  if (hwaddress)
2495  {
2496  clib_memcpy (hwaddr, hwaddress, 6);
2497  }
2498  else
2499  {
2500  random_u32 (&vum->random);
2501  clib_memcpy (hwaddr + 2, &vum->random, sizeof (vum->random));
2502  hwaddr[0] = 2;
2503  hwaddr[1] = 0xfe;
2504  }
2505 
2507  (vnm,
2508  vhost_user_dev_class.index,
2509  vui - vum->vhost_user_interfaces /* device instance */ ,
2510  hwaddr /* ethernet address */ ,
2511  &vui->hw_if_index, 0 /* flag change */ );
2512 
2513  if (error)
2514  clib_error_report (error);
2515 
2518 }
2519 
2520 /*
2521  * Initialize vui with specified attributes
2522  */
2523 static void
2525  vhost_user_intf_t * vui,
2526  int server_sock_fd,
2527  const char *sock_filename,
2528  u64 feature_mask, u32 * sw_if_index)
2529 {
2530  vnet_sw_interface_t *sw;
2531  sw = vnet_get_hw_sw_interface (vnm, vui->hw_if_index);
2532  int q;
2533 
2534  if (server_sock_fd != -1)
2535  {
2536  unix_file_t template = { 0 };
2538  template.file_descriptor = server_sock_fd;
2539  template.private_data = vui - vhost_user_main.vhost_user_interfaces; //hw index
2540  vui->unix_server_index = unix_file_add (&unix_main, &template);
2541  }
2542  else
2543  {
2544  vui->unix_server_index = ~0;
2545  }
2546 
2547  vui->sw_if_index = sw->sw_if_index;
2548  strncpy (vui->sock_filename, sock_filename,
2549  ARRAY_LEN (vui->sock_filename) - 1);
2550  vui->sock_errno = 0;
2551  vui->is_up = 0;
2552  vui->feature_mask = feature_mask;
2553  vui->unix_file_index = ~0;
2554  vui->log_base_addr = 0;
2555 
2556  for (q = 0; q < VHOST_VRING_MAX_N; q++)
2557  vhost_user_vring_init (vui, q);
2558 
2560 
2561  if (sw_if_index)
2562  *sw_if_index = vui->sw_if_index;
2563 
2564  for (q = 0; q < VHOST_VRING_MAX_N; q++)
2565  {
2568  memset ((void *) vui->vring_locks[q], 0, CLIB_CACHE_LINE_BYTES);
2569  }
2570 
2572  vlib_get_thread_main ()->n_vlib_mains - 1);
2574 }
2575 
2576 int
2578  const char *sock_filename,
2579  u8 is_server,
2580  u32 * sw_if_index,
2581  u64 feature_mask,
2582  u8 renumber, u32 custom_dev_instance, u8 * hwaddr)
2583 {
2584  vhost_user_intf_t *vui = NULL;
2585  u32 sw_if_idx = ~0;
2586  int rv = 0;
2587  int server_sock_fd = -1;
2588 
2589  if (is_server)
2590  {
2591  if ((rv =
2592  vhost_user_init_server_sock (sock_filename, &server_sock_fd)) != 0)
2593  {
2594  return rv;
2595  }
2596  }
2597 
2598  pool_get (vhost_user_main.vhost_user_interfaces, vui);
2599 
2600  vhost_user_create_ethernet (vnm, vm, vui, hwaddr);
2601  vhost_user_vui_init (vnm, vui, server_sock_fd, sock_filename,
2602  feature_mask, &sw_if_idx);
2603 
2604  if (renumber)
2605  vnet_interface_name_renumber (sw_if_idx, custom_dev_instance);
2606 
2607  if (sw_if_index)
2608  *sw_if_index = sw_if_idx;
2609 
2610  // Process node must connect
2612  return rv;
2613 }
2614 
2615 int
2617  const char *sock_filename,
2618  u8 is_server,
2619  u32 sw_if_index,
2620  u64 feature_mask, u8 renumber, u32 custom_dev_instance)
2621 {
2623  vhost_user_intf_t *vui = NULL;
2624  u32 sw_if_idx = ~0;
2625  int server_sock_fd = -1;
2626  int rv = 0;
2627  vnet_hw_interface_t *hwif;
2628 
2629  if (!(hwif = vnet_get_sup_hw_interface (vnm, sw_if_index)) ||
2630  hwif->dev_class_index != vhost_user_dev_class.index)
2631  return VNET_API_ERROR_INVALID_SW_IF_INDEX;
2632 
2634 
2635  // First try to open server socket
2636  if (is_server)
2637  if ((rv = vhost_user_init_server_sock (sock_filename,
2638  &server_sock_fd)) != 0)
2639  return rv;
2640 
2641  vhost_user_term_if (vui);
2642  vhost_user_vui_init (vnm, vui, server_sock_fd,
2643  sock_filename, feature_mask, &sw_if_idx);
2644 
2645  if (renumber)
2646  vnet_interface_name_renumber (sw_if_idx, custom_dev_instance);
2647 
2648  // Process node must connect
2650  return rv;
2651 }
2652 
2653 clib_error_t *
2655  unformat_input_t * input,
2656  vlib_cli_command_t * cmd)
2657 {
2658  unformat_input_t _line_input, *line_input = &_line_input;
2659  u8 *sock_filename = NULL;
2660  u32 sw_if_index;
2661  u8 is_server = 0;
2662  u64 feature_mask = (u64) ~ (0ULL);
2663  u8 renumber = 0;
2664  u32 custom_dev_instance = ~0;
2665  u8 hwaddr[6];
2666  u8 *hw = NULL;
2667 
2668  /* Get a line of input. */
2669  if (!unformat_user (input, unformat_line_input, line_input))
2670  return 0;
2671 
2672  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
2673  {
2674  if (unformat (line_input, "socket %s", &sock_filename))
2675  ;
2676  else if (unformat (line_input, "server"))
2677  is_server = 1;
2678  else if (unformat (line_input, "feature-mask 0x%llx", &feature_mask))
2679  ;
2680  else
2681  if (unformat
2682  (line_input, "hwaddr %U", unformat_ethernet_address, hwaddr))
2683  hw = hwaddr;
2684  else if (unformat (line_input, "renumber %d", &custom_dev_instance))
2685  {
2686  renumber = 1;
2687  }
2688  else
2689  return clib_error_return (0, "unknown input `%U'",
2690  format_unformat_error, input);
2691  }
2692  unformat_free (line_input);
2693 
2694  vnet_main_t *vnm = vnet_get_main ();
2695 
2696  int rv;
2697  if ((rv = vhost_user_create_if (vnm, vm, (char *) sock_filename,
2698  is_server, &sw_if_index, feature_mask,
2699  renumber, custom_dev_instance, hw)))
2700  {
2701  vec_free (sock_filename);
2702  return clib_error_return (0, "vhost_user_create_if returned %d", rv);
2703  }
2704 
2705  vec_free (sock_filename);
2707  sw_if_index);
2708  return 0;
2709 }
2710 
2711 clib_error_t *
2713  unformat_input_t * input,
2714  vlib_cli_command_t * cmd)
2715 {
2716  unformat_input_t _line_input, *line_input = &_line_input;
2717  u32 sw_if_index = ~0;
2718  vnet_main_t *vnm = vnet_get_main ();
2719 
2720  /* Get a line of input. */
2721  if (!unformat_user (input, unformat_line_input, line_input))
2722  return 0;
2723 
2724  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
2725  {
2726  if (unformat (line_input, "sw_if_index %d", &sw_if_index))
2727  ;
2728  else if (unformat
2729  (line_input, "%U", unformat_vnet_sw_interface, vnm,
2730  &sw_if_index))
2731  {
2732  vnet_hw_interface_t *hwif =
2733  vnet_get_sup_hw_interface (vnm, sw_if_index);
2734  if (hwif == NULL ||
2735  vhost_user_dev_class.index != hwif->dev_class_index)
2736  return clib_error_return (0, "Not a vhost interface");
2737  }
2738  else
2739  return clib_error_return (0, "unknown input `%U'",
2740  format_unformat_error, input);
2741  }
2742  unformat_free (line_input);
2743  vhost_user_delete_if (vnm, vm, sw_if_index);
2744  return 0;
2745 }
2746 
2747 int
2749  vhost_user_intf_details_t ** out_vuids)
2750 {
2751  int rv = 0;
2753  vhost_user_intf_t *vui;
2754  vhost_user_intf_details_t *r_vuids = NULL;
2756  u32 *hw_if_indices = 0;
2758  u8 *s = NULL;
2759  int i;
2760 
2761  if (!out_vuids)
2762  return -1;
2763 
2765  vec_add1 (hw_if_indices, vui->hw_if_index);
2766  );
2767 
2768  for (i = 0; i < vec_len (hw_if_indices); i++)
2769  {
2770  hi = vnet_get_hw_interface (vnm, hw_if_indices[i]);
2772 
2773  vec_add2 (r_vuids, vuid, 1);
2774  vuid->sw_if_index = vui->sw_if_index;
2775  vuid->virtio_net_hdr_sz = vui->virtio_net_hdr_sz;
2776  vuid->features = vui->features;
2777  vuid->num_regions = vui->nregions;
2778  vuid->sock_errno = vui->sock_errno;
2779  strncpy ((char *) vuid->sock_filename, (char *) vui->sock_filename,
2780  ARRAY_LEN (vuid->sock_filename) - 1);
2781 
2782  s = format (s, "%v%c", hi->name, 0);
2783 
2784  strncpy ((char *) vuid->if_name, (char *) s,
2785  ARRAY_LEN (vuid->if_name) - 1);
2786  _vec_len (s) = 0;
2787  }
2788 
2789  vec_free (s);
2790  vec_free (hw_if_indices);
2791 
2792  *out_vuids = r_vuids;
2793 
2794  return rv;
2795 }
2796 
2797 clib_error_t *
2799  unformat_input_t * input,
2800  vlib_cli_command_t * cmd)
2801 {
2802  clib_error_t *error = 0;
2803  vnet_main_t *vnm = vnet_get_main ();
2805  vhost_user_intf_t *vui;
2806  u32 hw_if_index, *hw_if_indices = 0;
2808  vhost_cpu_t *vhc;
2810  u32 ci;
2811 
2812  int i, j, q;
2813  int show_descr = 0;
2814  struct feat_struct
2815  {
2816  u8 bit;
2817  char *str;
2818  };
2819  struct feat_struct *feat_entry;
2820 
2821  static struct feat_struct feat_array[] = {
2822 #define _(s,b) { .str = #s, .bit = b, },
2824 #undef _
2825  {.str = NULL}
2826  };
2827 
2828 #define foreach_protocol_feature \
2829  _(VHOST_USER_PROTOCOL_F_MQ) \
2830  _(VHOST_USER_PROTOCOL_F_LOG_SHMFD)
2831 
2832  static struct feat_struct proto_feat_array[] = {
2833 #define _(s) { .str = #s, .bit = s},
2835 #undef _
2836  {.str = NULL}
2837  };
2838 
2839  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
2840  {
2841  if (unformat
2842  (input, "%U", unformat_vnet_hw_interface, vnm, &hw_if_index))
2843  {
2844  vec_add1 (hw_if_indices, hw_if_index);
2845  }
2846  else if (unformat (input, "descriptors") || unformat (input, "desc"))
2847  show_descr = 1;
2848  else
2849  {
2850  error = clib_error_return (0, "unknown input `%U'",
2851  format_unformat_error, input);
2852  goto done;
2853  }
2854  }
2855  if (vec_len (hw_if_indices) == 0)
2856  {
2858  vec_add1 (hw_if_indices, vui->hw_if_index);
2859  );
2860  }
2861  vlib_cli_output (vm, "Virtio vhost-user interfaces");
2862  vlib_cli_output (vm, "Global:\n coalesce frames %d time %e",
2863  vum->coalesce_frames, vum->coalesce_time);
2864 
2865  for (i = 0; i < vec_len (hw_if_indices); i++)
2866  {
2867  hi = vnet_get_hw_interface (vnm, hw_if_indices[i]);
2869  vlib_cli_output (vm, "Interface: %s (ifindex %d)",
2870  hi->name, hw_if_indices[i]);
2871 
2872  vlib_cli_output (vm, "virtio_net_hdr_sz %d\n"
2873  " features mask (0x%llx): \n"
2874  " features (0x%llx): \n",
2875  vui->virtio_net_hdr_sz, vui->feature_mask,
2876  vui->features);
2877 
2878  feat_entry = (struct feat_struct *) &feat_array;
2879  while (feat_entry->str)
2880  {
2881  if (vui->features & (1ULL << feat_entry->bit))
2882  vlib_cli_output (vm, " %s (%d)", feat_entry->str,
2883  feat_entry->bit);
2884  feat_entry++;
2885  }
2886 
2887  vlib_cli_output (vm, " protocol features (0x%llx)",
2888  vui->protocol_features);
2889  feat_entry = (struct feat_struct *) &proto_feat_array;
2890  while (feat_entry->str)
2891  {
2892  if (vui->protocol_features & (1ULL << feat_entry->bit))
2893  vlib_cli_output (vm, " %s (%d)", feat_entry->str,
2894  feat_entry->bit);
2895  feat_entry++;
2896  }
2897 
2898  vlib_cli_output (vm, "\n");
2899 
2900  vlib_cli_output (vm, " socket filename %s type %s errno \"%s\"\n\n",
2901  vui->sock_filename,
2902  (vui->unix_server_index != ~0) ? "server" : "client",
2903  strerror (vui->sock_errno));
2904 
2905  vlib_cli_output (vm, " rx placement: ");
2906  vec_foreach (vhc, vum->cpus)
2907  {
2908  vec_foreach (vhiq, vhc->rx_queues)
2909  {
2910  if (vhiq->vhost_iface_index == vui - vum->vhost_user_interfaces)
2911  vlib_cli_output (vm, " thread %d on vring %d\n",
2912  vhc - vum->cpus, VHOST_VRING_IDX_TX (vhiq->qid));
2913  }
2914  }
2915 
2916  vlib_cli_output (vm, " tx placement: %s\n",
2917  vui->use_tx_spinlock ? "spin-lock" : "lock-free");
2918 
2920  {
2921  vlib_cli_output (vm, " thread %d on vring %d\n", ci,
2922  VHOST_VRING_IDX_RX (vui->per_cpu_tx_qid[ci]));
2923  }
2924 
2925  vlib_cli_output (vm, "\n");
2926 
2927  vlib_cli_output (vm, " Memory regions (total %d)\n", vui->nregions);
2928 
2929  if (vui->nregions)
2930  {
2931  vlib_cli_output (vm,
2932  " region fd guest_phys_addr memory_size userspace_addr mmap_offset mmap_addr\n");
2933  vlib_cli_output (vm,
2934  " ====== ===== ================== ================== ================== ================== ==================\n");
2935  }
2936  for (j = 0; j < vui->nregions; j++)
2937  {
2938  vlib_cli_output (vm,
2939  " %d %-5d 0x%016lx 0x%016lx 0x%016lx 0x%016lx 0x%016lx\n",
2940  j, vui->region_mmap_fd[j],
2941  vui->regions[j].guest_phys_addr,
2942  vui->regions[j].memory_size,
2943  vui->regions[j].userspace_addr,
2944  vui->regions[j].mmap_offset,
2946  }
2947  for (q = 0; q < VHOST_VRING_MAX_N; q++)
2948  {
2949  if (!vui->vrings[q].started)
2950  continue;
2951 
2952  vlib_cli_output (vm, "\n Virtqueue %d (%s%s)\n", q,
2953  (q & 1) ? "RX" : "TX",
2954  vui->vrings[q].enabled ? "" : " disabled");
2955 
2956  vlib_cli_output (vm,
2957  " qsz %d last_avail_idx %d last_used_idx %d\n",
2958  vui->vrings[q].qsz, vui->vrings[q].last_avail_idx,
2959  vui->vrings[q].last_used_idx);
2960 
2961  if (vui->vrings[q].avail && vui->vrings[q].used)
2962  vlib_cli_output (vm,
2963  " avail.flags %x avail.idx %d used.flags %x used.idx %d\n",
2964  vui->vrings[q].avail->flags,
2965  vui->vrings[q].avail->idx,
2966  vui->vrings[q].used->flags,
2967  vui->vrings[q].used->idx);
2968 
2969  int kickfd = UNIX_GET_FD (vui->vrings[q].kickfd_idx);
2970  int callfd = UNIX_GET_FD (vui->vrings[q].callfd_idx);
2971  vlib_cli_output (vm, " kickfd %d callfd %d errfd %d\n",
2972  kickfd, callfd, vui->vrings[q].errfd);
2973 
2974  if (show_descr)
2975  {
2976  vlib_cli_output (vm, "\n descriptor table:\n");
2977  vlib_cli_output (vm,
2978  " id addr len flags next user_addr\n");
2979  vlib_cli_output (vm,
2980  " ===== ================== ===== ====== ===== ==================\n");
2981  for (j = 0; j < vui->vrings[q].qsz; j++)
2982  {
2983  u32 mem_hint = 0;
2984  vlib_cli_output (vm,
2985  " %-5d 0x%016lx %-5d 0x%04x %-5d 0x%016lx\n",
2986  j, vui->vrings[q].desc[j].addr,
2987  vui->vrings[q].desc[j].len,
2988  vui->vrings[q].desc[j].flags,
2989  vui->vrings[q].desc[j].next,
2991  (vui,
2992  vui->vrings[q].desc[j].
2993  addr, &mem_hint)));
2994  }
2995  }
2996  }
2997  vlib_cli_output (vm, "\n");
2998  }
2999 done:
3000  vec_free (hw_if_indices);
3001  return error;
3002 }
3003 
3004 /*
3005  * CLI functions
3006  */
3007 
3008 /*?
3009  * Create a vHost User interface. Once created, a new virtual interface
3010  * will exist with the name '<em>VirtualEthernet0/0/x</em>', where '<em>x</em>'
3011  * is the next free index.
3012  *
3013  * There are several parameters associated with a vHost interface:
3014  *
3015  * - <b>socket <socket-filename></b> - Name of the linux socket used by QEMU/VM and
3016  * VPP to manage the vHost interface. If socket does not already exist, VPP will
3017  * create the socket.
3018  *
3019  * - <b>server</b> - Optional flag to indicate that VPP should be the server for the
3020  * linux socket. If not provided, VPP will be the client.
3021  *
3022  * - <b>feature-mask <hex></b> - Optional virtio/vhost feature set negotiated at
3023  * startup. By default, all supported features will be advertised. Otherwise,
3024  * provide the set of features desired.
3025  * - 0x000008000 (15) - VIRTIO_NET_F_MRG_RXBUF
3026  * - 0x000020000 (17) - VIRTIO_NET_F_CTRL_VQ
3027  * - 0x000200000 (21) - VIRTIO_NET_F_GUEST_ANNOUNCE
3028  * - 0x000400000 (22) - VIRTIO_NET_F_MQ
3029  * - 0x004000000 (26) - VHOST_F_LOG_ALL
3030  * - 0x008000000 (27) - VIRTIO_F_ANY_LAYOUT
3031  * - 0x010000000 (28) - VIRTIO_F_INDIRECT_DESC
3032  * - 0x040000000 (30) - VHOST_USER_F_PROTOCOL_FEATURES
3033  * - 0x100000000 (32) - VIRTIO_F_VERSION_1
3034  *
3035  * - <b>hwaddr <mac-addr></b> - Optional ethernet address, can be in either
3036  * X:X:X:X:X:X unix or X.X.X cisco format.
3037  *
3038  * - <b>renumber <dev_instance></b> - Optional parameter which allows the instance
3039  * in the name to be specified. If instance already exists, name will be used
3040  * anyway and multiple instances will have the same name. Use with caution.
3041  *
3042  * @cliexpar
3043  * Example of how to create a vhost interface with VPP as the client and all features enabled:
3044  * @cliexstart{create vhost-user socket /tmp/vhost1.sock}
3045  * VirtualEthernet0/0/0
3046  * @cliexend
3047  * Example of how to create a vhost interface with VPP as the server and with just
3048  * multiple queues enabled:
3049  * @cliexstart{create vhost-user socket /tmp/vhost2.sock server feature-mask 0x40400000}
3050  * VirtualEthernet0/0/1
3051  * @cliexend
3052  * Once the vHost interface is created, enable the interface using:
3053  * @cliexcmd{set interface state VirtualEthernet0/0/0 up}
3054 ?*/
3055 /* *INDENT-OFF* */
3056 VLIB_CLI_COMMAND (vhost_user_connect_command, static) = {
3057  .path = "create vhost-user",
3058  .short_help = "create vhost-user socket <socket-filename> [server] [feature-mask <hex>] [hwaddr <mac-addr>] [renumber <dev_instance>]",
3059  .function = vhost_user_connect_command_fn,
3060 };
3061 /* *INDENT-ON* */
3062 
3063 /*?
3064  * Delete a vHost User interface using the interface name or the
3065  * software interface index. Use the '<em>show interfaces</em>'
3066  * command to determine the software interface index. On deletion,
3067  * the linux socket will not be deleted.
3068  *
3069  * @cliexpar
3070  * Example of how to delete a vhost interface by name:
3071  * @cliexcmd{delete vhost-user VirtualEthernet0/0/1}
3072  * Example of how to delete a vhost interface by software interface index:
3073  * @cliexcmd{delete vhost-user sw_if_index 1}
3074 ?*/
3075 /* *INDENT-OFF* */
3076 VLIB_CLI_COMMAND (vhost_user_delete_command, static) = {
3077  .path = "delete vhost-user",
3078  .short_help = "delete vhost-user {<interface> | sw_if_index <sw_idx>}",
3079  .function = vhost_user_delete_command_fn,
3080 };
3081 
3082 /*?
3083  * Display the attributes of a single vHost User interface (provide interface
 3084  * name), multiple vHost User interfaces (provide a list of interface names separated
3085  * by spaces) or all Vhost User interfaces (omit an interface name to display all
3086  * vHost interfaces).
3087  *
3088  * @cliexpar
3089  * @parblock
3090  * Example of how to display a vhost interface:
3091  * @cliexstart{show vhost-user VirtualEthernet0/0/0}
3092  * Virtio vhost-user interfaces
3093  * Global:
3094  * coalesce frames 32 time 1e-3
3095  * Interface: VirtualEthernet0/0/0 (ifindex 1)
3096  * virtio_net_hdr_sz 12
3097  * features mask (0xffffffffffffffff):
3098  * features (0x50408000):
3099  * VIRTIO_NET_F_MRG_RXBUF (15)
3100  * VIRTIO_NET_F_MQ (22)
3101  * VIRTIO_F_INDIRECT_DESC (28)
3102  * VHOST_USER_F_PROTOCOL_FEATURES (30)
3103  * protocol features (0x3)
3104  * VHOST_USER_PROTOCOL_F_MQ (0)
3105  * VHOST_USER_PROTOCOL_F_LOG_SHMFD (1)
3106  *
3107  * socket filename /tmp/vhost1.sock type client errno "Success"
3108  *
3109  * rx placement:
3110  * thread 1 on vring 1
3111  * thread 1 on vring 5
3112  * thread 2 on vring 3
3113  * thread 2 on vring 7
3114  * tx placement: spin-lock
3115  * thread 0 on vring 0
3116  * thread 1 on vring 2
3117  * thread 2 on vring 0
3118  *
3119  * Memory regions (total 2)
3120  * region fd guest_phys_addr memory_size userspace_addr mmap_offset mmap_addr
3121  * ====== ===== ================== ================== ================== ================== ==================
3122  * 0 60 0x0000000000000000 0x00000000000a0000 0x00002aaaaac00000 0x0000000000000000 0x00002aab2b400000
3123  * 1 61 0x00000000000c0000 0x000000003ff40000 0x00002aaaaacc0000 0x00000000000c0000 0x00002aababcc0000
3124  *
3125  * Virtqueue 0 (TX)
3126  * qsz 256 last_avail_idx 0 last_used_idx 0
3127  * avail.flags 1 avail.idx 128 used.flags 1 used.idx 0
3128  * kickfd 62 callfd 64 errfd -1
3129  *
3130  * Virtqueue 1 (RX)
3131  * qsz 256 last_avail_idx 0 last_used_idx 0
3132  * avail.flags 1 avail.idx 0 used.flags 1 used.idx 0
3133  * kickfd 65 callfd 66 errfd -1
3134  *
3135  * Virtqueue 2 (TX)
3136  * qsz 256 last_avail_idx 0 last_used_idx 0
3137  * avail.flags 1 avail.idx 128 used.flags 1 used.idx 0
3138  * kickfd 63 callfd 70 errfd -1
3139  *
3140  * Virtqueue 3 (RX)
3141  * qsz 256 last_avail_idx 0 last_used_idx 0
3142  * avail.flags 1 avail.idx 0 used.flags 1 used.idx 0
3143  * kickfd 72 callfd 74 errfd -1
3144  *
3145  * Virtqueue 4 (TX disabled)
3146  * qsz 256 last_avail_idx 0 last_used_idx 0
3147  * avail.flags 1 avail.idx 0 used.flags 1 used.idx 0
3148  * kickfd 76 callfd 78 errfd -1
3149  *
3150  * Virtqueue 5 (RX disabled)
3151  * qsz 256 last_avail_idx 0 last_used_idx 0
3152  * avail.flags 1 avail.idx 0 used.flags 1 used.idx 0
3153  * kickfd 80 callfd 82 errfd -1
3154  *
3155  * Virtqueue 6 (TX disabled)
3156  * qsz 256 last_avail_idx 0 last_used_idx 0
3157  * avail.flags 1 avail.idx 0 used.flags 1 used.idx 0
3158  * kickfd 84 callfd 86 errfd -1
3159  *
3160  * Virtqueue 7 (RX disabled)
3161  * qsz 256 last_avail_idx 0 last_used_idx 0
3162  * avail.flags 1 avail.idx 0 used.flags 1 used.idx 0
3163  * kickfd 88 callfd 90 errfd -1
3164  *
3165  * @cliexend
3166  *
3167  * The optional '<em>descriptors</em>' parameter will display the same output as
3168  * the previous example but will include the descriptor table for each queue.
3169  * The output is truncated below:
3170  * @cliexstart{show vhost-user VirtualEthernet0/0/0 descriptors}
3171  * Virtio vhost-user interfaces
3172  * Global:
3173  * coalesce frames 32 time 1e-3
3174  * Interface: VirtualEthernet0/0/0 (ifindex 1)
3175  * virtio_net_hdr_sz 12
3176  * features mask (0xffffffffffffffff):
3177  * features (0x50408000):
3178  * VIRTIO_NET_F_MRG_RXBUF (15)
3179  * VIRTIO_NET_F_MQ (22)
3180  * :
3181  * Virtqueue 0 (TX)
3182  * qsz 256 last_avail_idx 0 last_used_idx 0
3183  * avail.flags 1 avail.idx 128 used.flags 1 used.idx 0
3184  * kickfd 62 callfd 64 errfd -1
3185  *
3186  * descriptor table:
3187  * id addr len flags next user_addr
3188  * ===== ================== ===== ====== ===== ==================
3189  * 0 0x0000000010b6e974 2060 0x0002 1 0x00002aabbc76e974
3190  * 1 0x0000000010b6e034 2060 0x0002 2 0x00002aabbc76e034
3191  * 2 0x0000000010b6d6f4 2060 0x0002 3 0x00002aabbc76d6f4
3192  * 3 0x0000000010b6cdb4 2060 0x0002 4 0x00002aabbc76cdb4
3193  * 4 0x0000000010b6c474 2060 0x0002 5 0x00002aabbc76c474
3194  * 5 0x0000000010b6bb34 2060 0x0002 6 0x00002aabbc76bb34
3195  * 6 0x0000000010b6b1f4 2060 0x0002 7 0x00002aabbc76b1f4
3196  * 7 0x0000000010b6a8b4 2060 0x0002 8 0x00002aabbc76a8b4
3197  * 8 0x0000000010b69f74 2060 0x0002 9 0x00002aabbc769f74
3198  * 9 0x0000000010b69634 2060 0x0002 10 0x00002aabbc769634
3199  * 10 0x0000000010b68cf4 2060 0x0002 11 0x00002aabbc768cf4
3200  * :
3201  * 249 0x0000000000000000 0 0x0000 250 0x00002aab2b400000
3202  * 250 0x0000000000000000 0 0x0000 251 0x00002aab2b400000
3203  * 251 0x0000000000000000 0 0x0000 252 0x00002aab2b400000
3204  * 252 0x0000000000000000 0 0x0000 253 0x00002aab2b400000
3205  * 253 0x0000000000000000 0 0x0000 254 0x00002aab2b400000
3206  * 254 0x0000000000000000 0 0x0000 255 0x00002aab2b400000
3207  * 255 0x0000000000000000 0 0x0000 32768 0x00002aab2b400000
3208  *
3209  * Virtqueue 1 (RX)
3210  * qsz 256 last_avail_idx 0 last_used_idx 0
3211  * :
3212  * @cliexend
3213  * @endparblock
3214 ?*/
3215 /* *INDENT-OFF* */
3216 VLIB_CLI_COMMAND (show_vhost_user_command, static) = {
3217  .path = "show vhost-user",
3218  .short_help = "show vhost-user [<interface> [<interface> [..]]] [descriptors]",
3219  .function = show_vhost_user_command_fn,
3220 };
3221 /* *INDENT-ON* */
3222 
3223 static clib_error_t *
3225 {
3227 
3228  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
3229  {
3230  if (unformat (input, "coalesce-frames %d", &vum->coalesce_frames))
3231  ;
3232  else if (unformat (input, "coalesce-time %f", &vum->coalesce_time))
3233  ;
3234  else if (unformat (input, "dont-dump-memory"))
3235  vum->dont_dump_vhost_user_memory = 1;
3236  else
3237  return clib_error_return (0, "unknown input `%U'",
3238  format_unformat_error, input);
3239  }
3240 
3241  return 0;
3242 }
3243 
3244 /* vhost-user { ... } configuration. */
3245 VLIB_CONFIG_FUNCTION (vhost_user_config, "vhost-user");
3246 
3247 void
3249 {
3251  vhost_user_intf_t *vui;
3252 
3253  if (vum->dont_dump_vhost_user_memory)
3254  {
3256  unmap_all_mem_regions (vui);
3257  );
3258  }
3259 }
3260 
3261 static clib_error_t *
3263  unformat_input_t * input, vlib_cli_command_t * cmd)
3264 {
3265  unformat_input_t _line_input, *line_input = &_line_input;
3266  u32 worker_thread_index;
3267  u32 sw_if_index;
3268  u8 del = 0;
3269  int rv;
3270 
3271  /* Get a line of input. */
3272  if (!unformat_user (input, unformat_line_input, line_input))
3273  return 0;
3274 
3275  if (!unformat
3276  (line_input, "%U %d", unformat_vnet_sw_interface, vnet_get_main (),
3277  &sw_if_index, &worker_thread_index))
3278  {
3279  unformat_free (line_input);
3280  return clib_error_return (0, "unknown input `%U'",
3281  format_unformat_error, input);
3282  }
3283 
3284  if (unformat (line_input, "del"))
3285  del = 1;
3286 
3287  if ((rv =
3288  vhost_user_thread_placement (sw_if_index, worker_thread_index, del)))
3289  return clib_error_return (0, "vhost_user_thread_placement returned %d",
3290  rv);
3291  return 0;
3292 }
3293 
3294 
3295 /*?
3296  * This command is used to move the RX processing for the given
3297  * interfaces to the provided thread. If the '<em>del</em>' option is used,
 3298  * the forced thread assignment is removed and the thread assignment is
3299  * reassigned automatically. Use '<em>show vhost-user <interface></em>'
3300  * to see the thread assignment.
3301  *
3302  * @cliexpar
3303  * Example of how to move the RX processing for a given interface to a given thread:
3304  * @cliexcmd{vhost thread VirtualEthernet0/0/0 1}
3305  * Example of how to remove the forced thread assignment for a given interface:
3306  * @cliexcmd{vhost thread VirtualEthernet0/0/0 1 del}
3307 ?*/
3308 /* *INDENT-OFF* */
3309 VLIB_CLI_COMMAND (vhost_user_thread_command, static) = {
3310  .path = "vhost thread",
3311  .short_help = "vhost thread <iface> <worker-index> [del]",
3312  .function = vhost_thread_command_fn,
3313 };
3314 /* *INDENT-ON* */
3315 
3316 /*
3317  * fd.io coding-style-patch-verification: ON
3318  *
3319  * Local Variables:
3320  * eval: (c-set-style "gnu")
3321  * End:
3322  */
unformat_function_t unformat_vnet_hw_interface
#define vec_validate(V, I)
Make sure vector is long enough for given index (no header, unspecified alignment) ...
Definition: vec.h:396
static clib_error_t * vhost_user_init(vlib_main_t *vm)
Definition: vhost-user.c:1208
unix_file_t * file_pool
Definition: unix.h:89
void vlib_put_next_frame(vlib_main_t *vm, vlib_node_runtime_t *r, u32 next_index, u32 n_vectors_left)
Release pointer to next frame vector data.
Definition: main.c:459
static void vlib_increment_simple_counter(vlib_simple_counter_main_t *cm, u32 cpu_index, u32 index, u32 increment)
Increment a simple counter.
Definition: counter.h:78
static void vhost_user_vring_close(vhost_user_intf_t *vui, u32 qid)
Definition: vhost-user.c:568
vmrglw vmrglh hi
static void vhost_user_if_disconnect(vhost_user_intf_t *vui)
Definition: vhost-user.c:591
#define vec_foreach_index(var, v)
Iterate over vector indices.
sll srl srl sll sra u16x4 i
Definition: vector_sse2.h:343
vring_desc_t * desc
Definition: vhost-user.h:197
#define CLIB_UNUSED(x)
Definition: clib.h:79
uword unformat(unformat_input_t *i, char *fmt,...)
Definition: unformat.c:966
u32 virtio_ring_flags
The device index.
Definition: vhost-user.h:271
virtio_net_hdr_mrg_rxbuf_t hdr
Length of the first data descriptor.
Definition: vhost-user.h:273
static uword random_default_seed(void)
Default random seed (unix/linux user-mode)
Definition: random.h:91
clib_error_t * vnet_hw_interface_set_flags(vnet_main_t *vnm, u32 hw_if_index, u32 flags)
Definition: interface.c:531
static u32 vlib_get_trace_count(vlib_main_t *vm, vlib_node_runtime_t *rt)
Definition: trace_funcs.h:143
static f64 vlib_process_wait_for_event_or_clock(vlib_main_t *vm, f64 dt)
Suspend a cooperative multi-tasking thread Waits for an event, or for the indicated number of seconds...
Definition: node_funcs.h:684
unix_file_function_t * read_function
Definition: unix.h:62
vhost_cpu_t * cpus
Per-CPU data for vhost-user.
Definition: vhost-user.h:310
static void vhost_user_create_ethernet(vnet_main_t *vnm, vlib_main_t *vm, vhost_user_intf_t *vui, u8 *hwaddress)
Create ethernet interface for vhost user interface.
Definition: vhost-user.c:2486
#define VHOST_USER_DOWN_DISCARD_COUNT
Definition: vhost-user.c:72
u8 runtime_data[0]
Definition: node.h:469
void ethernet_delete_interface(vnet_main_t *vnm, u32 hw_if_index)
Definition: interface.c:275
#define VHOST_VRING_IDX_TX(qid)
Definition: vhost-user.h:24
static vlib_main_t * vlib_get_main(void)
Definition: global_funcs.h:23
static vnet_hw_interface_t * vnet_get_sup_hw_interface(vnet_main_t *vnm, u32 sw_if_index)
static clib_error_t * vhost_user_socket_error(unix_file_t *uf)
Definition: vhost-user.c:1164
void vhost_user_rx_trace(vhost_trace_t *t, vhost_user_intf_t *vui, u16 qid, vlib_buffer_t *b, vhost_user_vring_t *txvq)
Definition: vhost-user.c:1300
u64 region_guest_addr_hi[VHOST_MEMORY_MAX_NREGIONS]
Definition: vhost-user.h:233
vnet_interface_main_t interface_main
Definition: vnet.h:57
#define VLIB_BUFFER_TRACE_TRAJECTORY_INIT(b)
Definition: buffer.h:406
#define PREDICT_TRUE(x)
Definition: clib.h:98
static void vlib_error_count(vlib_main_t *vm, uword node_index, uword counter, uword increment)
Definition: error_funcs.h:55
#define UNFORMAT_END_OF_INPUT
Definition: format.h:143
#define NULL
Definition: clib.h:55
u32 vlib_buffer_alloc_from_free_list(vlib_main_t *vm, u32 *buffers, u32 n_buffers, u32 free_list_index)
Allocate buffers from specific freelist into supplied array.
static f64 vlib_time_now(vlib_main_t *vm)
Definition: main.h:182
#define foreach_virtio_trace_flags
Definition: vhost-user.c:95
vhost_copy_t copy[VHOST_USER_COPY_ARRAY_N]
Definition: vhost-user.h:287
static void vhost_user_term_if(vhost_user_intf_t *vui)
Disables and reset interface structure.
Definition: vhost-user.c:2391
vring_avail_t * avail
Definition: vhost-user.h:198
static uword vhost_user_input(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *f)
Definition: vhost-user.c:1825
static vnet_hw_interface_t * vnet_get_hw_interface(vnet_main_t *vnm, u32 hw_if_index)
static_always_inline void vnet_feature_start_device_input_x1(u32 sw_if_index, u32 *next0, vlib_buffer_t *b0, u16 buffer_advanced0)
Definition: feature.h:229
#define vec_add1(V, E)
Add 1 element to end of vector (unspecified alignment).
Definition: vec.h:482
int vnet_interface_name_renumber(u32 sw_if_index, u32 new_show_dev_instance)
Definition: interface.c:1246
struct _vlib_node_registration vlib_node_registration_t
static_always_inline u32 vhost_user_input_copy(vhost_user_intf_t *vui, vhost_copy_t *cpy, u16 copy_len, u32 *map_hint)
Definition: vhost-user.c:1360
#define VHOST_USER_MSG_HDR_SZ
Definition: vhost-user.h:20
static clib_error_t * vhost_user_interface_admin_up_down(vnet_main_t *vnm, u32 hw_if_index, u32 flags)
Definition: vhost-user.c:2260
#define vec_add2(V, P, N)
Add N elements to end of vector V, return pointer to new elements in P.
Definition: vec.h:521
static vnet_sw_interface_t * vnet_get_sw_interface(vnet_main_t *vnm, u32 sw_if_index)
static clib_error_t * vhost_thread_command_fn(vlib_main_t *vm, unformat_input_t *input, vlib_cli_command_t *cmd)
Definition: vhost-user.c:3262
clib_error_t * show_vhost_user_command_fn(vlib_main_t *vm, unformat_input_t *input, vlib_cli_command_t *cmd)
Definition: vhost-user.c:2798
unformat_function_t unformat_vnet_sw_interface
#define clib_error_report(e)
Definition: error.h:125
#define VNET_HW_INTERFACE_FLAG_LINK_UP
Definition: interface.h:377
static char * vhost_user_input_func_error_strings[]
Definition: vhost-user.c:149
static char * vhost_user_tx_func_error_strings[]
Definition: vhost-user.c:127
#define pool_get(P, E)
Allocate an object E from a pool P (unspecified alignment).
Definition: pool.h:200
vring_used_t * used
Definition: vhost-user.h:199
format_function_t format_vnet_sw_if_index_name
vhost_trace_t * current_trace
Definition: vhost-user.h:291
static uword unix_file_add(unix_main_t *um, unix_file_t *template)
Definition: unix.h:136
#define vec_reset_length(v)
Reset vector length to zero NULL-pointer tolerant.
static int vhost_user_name_renumber(vnet_hw_interface_t *hi, u32 new_dev_instance)
Definition: vhost-user.c:183
static void vhost_user_vui_init(vnet_main_t *vnm, vhost_user_intf_t *vui, int server_sock_fd, const char *sock_filename, u64 feature_mask, u32 *sw_if_index)
Definition: vhost-user.c:2524
static vnet_sw_interface_t * vnet_get_hw_sw_interface(vnet_main_t *vnm, u32 hw_if_index)
static void vlib_trace_buffer(vlib_main_t *vm, vlib_node_runtime_t *r, u32 next_index, vlib_buffer_t *b, int follow_chain)
Definition: trace_funcs.h:104
#define VHOST_VRING_F_LOG
Definition: vhost-user.h:32
vnet_main_t * vnet_get_main(void)
Definition: misc.c:46
VNET_DEVICE_CLASS(vhost_user_dev_class, static)
static u8 * format_vhost_user_interface_name(u8 *s, va_list *args)
Definition: vhost-user.c:166
#define static_always_inline
Definition: clib.h:85
#define pool_foreach(VAR, POOL, BODY)
Iterate through pool.
Definition: pool.h:348
#define vlib_prefetch_buffer_with_index(vm, bi, type)
Prefetch buffer metadata by buffer index The first 64 bytes of buffer contains most header informatio...
Definition: buffer_funcs.h:170
#define VLIB_INIT_FUNCTION(x)
Definition: init.h:111
static uword vlib_process_get_events(vlib_main_t *vm, uword **data_vector)
Return the first event type which has occurred and a vector of per-event data of that type...
Definition: node_funcs.h:527
static clib_error_t * ip4_init(vlib_main_t *vm)
Definition: ip4_input.c:464
static void * vlib_buffer_get_current(vlib_buffer_t *b)
Get pointer to current data to process.
Definition: buffer.h:194
vlib_combined_counter_main_t * combined_sw_if_counters
Definition: interface.h:615
u8 * format_white_space(u8 *s, va_list *va)
Definition: std-formats.c:113
void * log_base_addr
Definition: vhost-user.h:243
#define foreach_protocol_feature
#define vec_elt_at_index(v, i)
Get vector value at index i checking that i is in bounds.
static void unformat_free(unformat_input_t *i)
Definition: format.h:161
vhost_user_tx_func_error_t
Definition: vhost-user.c:119
#define clib_warning(format, args...)
Definition: error.h:59
unsigned long u64
Definition: types.h:89
static void unmap_all_mem_regions(vhost_user_intf_t *vui)
Definition: vhost-user.c:293
uword unformat_user(unformat_input_t *input, unformat_function_t *func,...)
Definition: unformat.c:977
vhost_iface_and_queue_t * rx_queues
Definition: vhost-user.h:282
vhost_user_input_func_error_t
Definition: vhost-user.c:141
#define vlib_call_init_function(vm, x)
Definition: init.h:161
static clib_error_t * vhost_user_socket_read(unix_file_t *uf)
Definition: vhost-user.c:655
static uword pointer_to_uword(const void *p)
Definition: types.h:131
#define VLIB_BUFFER_NEXT_PRESENT
Definition: buffer.h:97
#define UNIX_GET_FD(unixfd_idx)
Definition: vhost-user.c:90
#define VLIB_BUFFER_PRE_DATA_SIZE
Definition: buffer.h:52
static int vhost_user_init_server_sock(const char *sock_filename, int *sock_fd)
Open server unix socket on specified sock_filename.
Definition: vhost-user.c:2446
static void unix_file_del(unix_main_t *um, unix_file_t *f)
Definition: unix.h:146
VLIB_DEVICE_TX_FUNCTION_MULTIARCH(vhost_user_dev_class, vhost_user_tx)
Definition: vhost-user.c:2290
static void vhost_user_vring_unlock(vhost_user_intf_t *vui, u32 qid)
Unlock the vring lock.
Definition: vhost-user.c:540
format_function_t format_vnet_sw_interface_name
#define pool_elt_at_index(p, i)
Returns pointer to element at given index.
Definition: pool.h:369
static uword format_get_indent(u8 *s)
Definition: format.h:72
u32 file_descriptor
Definition: unix.h:52
u16 current_length
Nbytes between current data and the end of this buffer.
Definition: buffer.h:82
static void vlib_process_signal_event(vlib_main_t *vm, uword node_index, uword type_opaque, uword data)
Definition: node_funcs.h:931
uword private_data
Definition: unix.h:59
int vhost_user_delete_if(vnet_main_t *vnm, vlib_main_t *vm, u32 sw_if_index)
Definition: vhost-user.c:2410
static void * map_user_mem(vhost_user_intf_t *vui, uword addr)
Definition: vhost-user.c:268
u32 random
Pseudo random iterator.
Definition: vhost-user.h:313
static_always_inline void vhost_user_log_dirty_pages(vhost_user_intf_t *vui, u64 addr, u64 len)
Definition: vhost-user.c:643
uword os_get_cpu_number(void)
Definition: unix-misc.c:224
#define VIRTQ_DESC_F_INDIRECT
Definition: vhost-user.h:27
#define clib_error_return_unix(e, args...)
Definition: error.h:114
#define pool_put(P, E)
Free an object E in pool P.
Definition: pool.h:214
void vhost_user_tx_trace(vhost_trace_t *t, vhost_user_intf_t *vui, u16 qid, vlib_buffer_t *b, vhost_user_vring_t *rxvq)
Definition: vhost-user.c:1866
#define PREDICT_FALSE(x)
Definition: clib.h:97
#define VLIB_CONFIG_FUNCTION(x, n,...)
Definition: init.h:118
#define vhost_user_log_dirty_ring(vui, vq, member)
Definition: vhost-user.c:648
static vlib_node_registration_t vhost_user_process_node
(constructor) VLIB_REGISTER_NODE (vhost_user_process_node)
Definition: vhost-user.c:2379
void vhost_user_unmap_all(void)
Definition: vhost-user.c:3248
#define vec_del1(v, i)
Delete the element at index I.
Definition: vec.h:765
char sock_filename[256]
Definition: vhost-user.h:219
vnet_main_t vnet_main
Definition: misc.c:43
#define VLIB_FRAME_SIZE
Definition: node.h:328
vlib_simple_counter_main_t * sw_if_counters
Definition: interface.h:614
u32 region_mmap_fd[VHOST_MEMORY_MAX_NREGIONS]
Definition: vhost-user.h:234
static void vhost_user_send_call(vlib_main_t *vm, vhost_user_vring_t *vq)
Definition: vhost-user.c:1347
vhost_user_memory_region_t regions[VHOST_MEMORY_MAX_NREGIONS]
Definition: vhost-user.h:230
#define vlib_validate_buffer_enqueue_x1(vm, node, next_index, to_next, n_left_to_next, bi0, next0)
Finish enqueueing one buffer forward in the graph.
Definition: buffer_node.h:216
#define vlib_get_next_frame(vm, node, next_index, vectors, n_vectors_left)
Get pointer to next frame vector data by (vlib_node_runtime_t, next_index).
Definition: node_funcs.h:350
void vlib_cli_output(vlib_main_t *vm, char *fmt,...)
Definition: cli.c:576
int vhost_user_dump_ifs(vnet_main_t *vnm, vlib_main_t *vm, vhost_user_intf_details_t **out_vuids)
Definition: vhost-user.c:2748
vlib_error_t error
Error code for buffers to be enqueued to error handler.
Definition: buffer.h:121
static clib_error_t * vhost_user_exit(vlib_main_t *vm)
Definition: vhost-user.c:1253
static void vhost_user_tx_thread_placement(vhost_user_intf_t *vui)
Definition: vhost-user.c:329
static void vhost_user_vring_init(vhost_user_intf_t *vui, u32 qid)
Definition: vhost-user.c:546
u8 * format_ethernet_header_with_length(u8 *s, va_list *args)
Definition: format.c:115
u32 * show_dev_instance_by_real_dev_instance
Definition: vhost-user.h:298
int vhost_user_create_if(vnet_main_t *vnm, vlib_main_t *vm, const char *sock_filename, u8 is_server, u32 *sw_if_index, u64 feature_mask, u8 renumber, u32 custom_dev_instance, u8 *hwaddr)
Definition: vhost-user.c:2577
u16 device_index
The interface queue index (Not the virtio vring idx)
Definition: vhost-user.h:270
vhost_user_intf_t * vhost_user_interfaces
Definition: vhost-user.h:297
u16 n_vectors
Definition: node.h:344
static clib_error_t * vhost_user_kickfd_read_ready(unix_file_t *uf)
Definition: vhost-user.c:498
#define CLIB_PREFETCH(addr, size, type)
Definition: cache.h:82
static_always_inline void vhost_user_log_dirty_pages_2(vhost_user_intf_t *vui, u64 addr, u64 len, u8 is_host_address)
Definition: vhost-user.c:615
#define vec_free(V)
Free vector&#39;s memory (no header).
Definition: vec.h:300
static vlib_thread_main_t * vlib_get_thread_main()
Definition: global_funcs.h:32
static int vhost_user_vring_try_lock(vhost_user_intf_t *vui, u32 qid)
Try once to lock the vring.
Definition: vhost-user.c:521
#define VLIB_MAIN_LOOP_EXIT_FUNCTION(x)
Definition: init.h:115
int vhost_user_modify_if(vnet_main_t *vnm, vlib_main_t *vm, const char *sock_filename, u8 is_server, u32 sw_if_index, u64 feature_mask, u8 renumber, u32 custom_dev_instance)
Definition: vhost-user.c:2616
#define clib_memcpy(a, b, c)
Definition: string.h:69
#define VHOST_MEMORY_MAX_NREGIONS
Definition: vhost-user.h:19
static_always_inline void * map_guest_mem(vhost_user_intf_t *vui, uword addr, u32 *hint)
Definition: vhost-user.c:200
#define VLIB_BUFFER_TOTAL_LENGTH_VALID
Definition: buffer.h:99
u32 nregions
Definition: vhost-user.h:74
vlib_main_t vlib_global_main
Definition: main.c:1562
void vlib_worker_thread_barrier_sync(vlib_main_t *vm)
Definition: threads.c:1163
#define ARRAY_LEN(x)
Definition: clib.h:59
static uword vhost_user_tx(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame)
Definition: vhost-user.c:1950
u16 first_desc_len
Runtime queue flags.
Definition: vhost-user.h:272
#define VHOST_USER_PROTOCOL_F_LOG_SHMFD
Definition: vhost-user.h:31
static void vhost_user_input_rewind_buffers(vlib_main_t *vm, vhost_cpu_t *cpu, vlib_buffer_t *b_head)
Definition: vhost-user.c:1447
#define VLIB_CLI_COMMAND(x,...)
Definition: cli.h:154
#define VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX
Definition: buffer.h:310
#define VNET_SW_INTERFACE_FLAG_ADMIN_UP
Definition: interface.h:528
u32 max_l3_packet_bytes[VLIB_N_RX_TX]
Definition: interface.h:449
u32 rx_buffers[VHOST_USER_RX_BUFFERS_N]
Definition: vhost-user.h:284
uword unformat_ethernet_address(unformat_input_t *input, va_list *args)
Definition: format.c:245
static void vlib_increment_combined_counter(vlib_combined_counter_main_t *cm, u32 cpu_index, u32 index, u32 packet_increment, u32 byte_increment)
Increment a combined counter.
Definition: counter.h:241
#define ASSERT(truth)
#define VHOST_USER_RX_BUFFER_STARVATION
Definition: vhost-user.c:78
unsigned int u32
Definition: types.h:88
u8 * format_unformat_error(u8 *s, va_list *va)
Definition: unformat.c:91
#define vnet_buffer(b)
Definition: buffer.h:361
void vlib_buffer_free(vlib_main_t *vm, u32 *buffers, u32 n_buffers)
Free buffers Frees the entire buffer chain for each buffer.
static long get_huge_page_size(int fd)
Definition: vhost-user.c:285
u32 next_buffer
Next buffer for this linked-list of buffers.
Definition: buffer.h:117
clib_error_t * vhost_user_delete_command_fn(vlib_main_t *vm, unformat_input_t *input, vlib_cli_command_t *cmd)
Definition: vhost-user.c:2712
static int vhost_user_thread_placement(u32 sw_if_index, u32 worker_thread_index, u8 del)
Definition: vhost-user.c:417
#define VIRTQ_DESC_F_NEXT
Definition: vhost-user.h:26
volatile u32 * vring_locks[VHOST_VRING_MAX_N]
Definition: vhost-user.h:238
clib_error_t * ethernet_register_interface(vnet_main_t *vnm, u32 dev_class_index, u32 dev_instance, u8 *address, u32 *hw_if_index_return, ethernet_flag_change_function_t flag_change)
Definition: interface.c:226
virtio_trace_flag_t
Definition: vhost-user.c:101
static void * vlib_frame_args(vlib_frame_t *f)
Get pointer to frame scalar data.
Definition: node_funcs.h:270
static void vlib_node_set_state(vlib_main_t *vm, u32 node_index, vlib_node_state_t new_state)
Set node dispatch state.
Definition: node_funcs.h:146
uword * thread_registrations_by_name
Definition: threads.h:274
unix_main_t unix_main
Definition: main.c:57
static u8 * format_vhost_trace(u8 *s, va_list *va)
Definition: vhost-user.c:1262
#define VHOST_USER_RX_COPY_THRESHOLD
Definition: vhost-user.c:88
#define VLIB_BUFFER_IS_TRACED
Definition: buffer.h:95
static void vhost_user_vring_lock(vhost_user_intf_t *vui, u32 qid)
Spin until the vring is successfully locked.
Definition: vhost-user.c:530
static void vhost_user_rx_thread_placement()
Definition: vhost-user.c:364
u64 uword
Definition: types.h:112
static void * vlib_add_trace(vlib_main_t *vm, vlib_node_runtime_t *r, vlib_buffer_t *b, u32 n_data_bytes)
Definition: trace_funcs.h:55
static void vhost_user_update_iface_state(vhost_user_intf_t *vui)
Definition: vhost-user.c:471
u32 total_length_not_including_first_buffer
Only valid for first buffer in chain.
Definition: buffer.h:112
#define foreach_vhost_user_tx_func_error
Definition: vhost-user.c:110
void * region_mmap_addr[VHOST_MEMORY_MAX_NREGIONS]
Definition: vhost-user.h:231
Definition: defs.h:47
unsigned short u16
Definition: types.h:57
static clib_error_t * vhost_user_socksvr_accept_ready(unix_file_t *uf)
Definition: vhost-user.c:1180
static clib_error_t * vhost_user_config(vlib_main_t *vm, unformat_input_t *input)
Definition: vhost-user.c:3224
u32 input_cpu_count
total cpu count
Definition: vhost-user.h:307
#define vec_len(v)
Number of elements in vector (rvalue-only, NULL tolerant)
double f64
Definition: types.h:142
#define VRING_USED_F_NO_NOTIFY
Definition: vhost-user.h:44
#define VHOST_USER_RX_BUFFERS_N
Definition: vhost-user.h:277
unsigned char u8
Definition: types.h:56
int vhost_user_intf_ready(vhost_user_intf_t *vui)
Returns whether at least one TX and one RX vring are enabled.
Definition: vhost-user.c:459
vhost_user_vring_t vrings[VHOST_VRING_MAX_N]
Definition: vhost-user.h:237
u32 input_cpu_first_index
first cpu index
Definition: vhost-user.h:304
#define VHOST_VRING_MAX_N
Definition: vhost-user.h:22
vlib_main_t ** vlib_mains
Definition: unix.h:49
vlib_node_registration_t vhost_user_input_node
(constructor) VLIB_REGISTER_NODE (vhost_user_input_node)
Definition: vhost-user.c:108
#define VLIB_BUFFER_DATA_SIZE
Definition: buffer.h:51
u32 rx_buffers_len
Definition: vhost-user.h:283
#define DBG_SOCK(args...)
Definition: vhost-user.c:58
#define hash_get_mem(h, key)
Definition: hash.h:268
u32 vhost_user_rx_discard_packet(vlib_main_t *vm, vhost_user_intf_t *vui, vhost_user_vring_t *txvq, u32 discard_max)
Try to discard packets from the tx ring (VPP RX path).
Definition: vhost-user.c:1408
static vhost_user_main_t vhost_user_main
Definition: vhost-user.c:156
static void * clib_mem_alloc_aligned(uword size, uword align)
Definition: mem.h:117
#define DBG_VQ(args...)
Definition: vhost-user.c:64
#define VLIB_NODE_FUNCTION_MULTIARCH(node, fn)
Definition: node.h:158
static uword unformat_check_input(unformat_input_t *i)
Definition: format.h:169
static u32 random_u32(u32 *seed)
32-bit random number generator
Definition: random.h:69
void vlib_worker_thread_barrier_release(vlib_main_t *vm)
Definition: threads.c:1195
#define VLIB_REGISTER_NODE(x,...)
Definition: node.h:143
u8 * format(u8 *s, const char *fmt,...)
Definition: format.c:418
u64 region_guest_addr_lo[VHOST_MEMORY_MAX_NREGIONS]
Definition: vhost-user.h:232
static clib_error_t * vhost_user_callfd_read_ready(unix_file_t *uf)
Definition: vhost-user.c:489
static u32 vhost_user_if_input(vlib_main_t *vm, vhost_user_main_t *vum, vhost_user_intf_t *vui, u16 qid, vlib_node_runtime_t *node)
Definition: vhost-user.c:1465
#define vec_foreach(var, vec)
Vector iterator.
#define foreach_vhost_user_input_func_error
Definition: vhost-user.c:133
#define CLIB_MEMORY_BARRIER()
Definition: clib.h:101
vhost_vring_addr_t addr
Definition: vhost-user.h:81
virtio_net_hdr_mrg_rxbuf_t tx_headers[VLIB_FRAME_SIZE]
Definition: vhost-user.h:286
#define clib_error_return(e, args...)
Definition: error.h:111
struct _unformat_input_t unformat_input_t
static void vlib_set_trace_count(vlib_main_t *vm, vlib_node_runtime_t *rt, u32 count)
Definition: trace_funcs.h:159
u32 flags
Definition: vhost-user.h:75
#define vec_validate_init_empty(V, I, INIT)
Make sure vector is long enough for given index and initialize empty space (no header, unspecified alignment)
Definition: vec.h:445
#define CLIB_CACHE_LINE_BYTES
Definition: cache.h:67
u32 flags
buffer flags: VLIB_BUFFER_IS_TRACED: trace this buffer.
Definition: buffer.h:85
static_always_inline u32 vhost_user_tx_copy(vhost_user_intf_t *vui, vhost_copy_t *cpy, u16 copy_len, u32 *map_hint)
Definition: vhost-user.c:1902
unformat_function_t unformat_line_input
Definition: format.h:281
#define VHOST_USER_PROTOCOL_F_MQ
Definition: vhost-user.h:30
#define VHOST_LOG_PAGE
Definition: vhost-user.c:613
static vlib_buffer_t * vlib_get_buffer(vlib_main_t *vm, u32 buffer_index)
Translate buffer index into buffer pointer.
Definition: buffer_funcs.h:57
clib_error_t * vhost_user_connect_command_fn(vlib_main_t *vm, unformat_input_t *input, vlib_cli_command_t *cmd)
Definition: vhost-user.c:2654
VNET_HW_INTERFACE_CLASS(vhost_interface_class, static)
Definition: defs.h:46
int dont_dump_vhost_user_memory
Definition: vhost-user.h:301
#define VHOST_VRING_IDX_RX(qid)
Definition: vhost-user.h:23