/*
 * tcp_input.c — TCP input path, FD.io VPP (Vector Packet Processing)
 * Recovered from a Doxygen page dump of VPP v20.01-48-g3e0dafb74.
 */
/*
 * Copyright (c) 2016-2019 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
15 
16 #include <vppinfra/sparse_vec.h>
17 #include <vnet/fib/ip4_fib.h>
18 #include <vnet/fib/ip6_fib.h>
19 #include <vnet/tcp/tcp_packet.h>
20 #include <vnet/tcp/tcp.h>
21 #include <vnet/session/session.h>
22 #include <math.h>
23 
24 static char *tcp_error_strings[] = {
25 #define tcp_error(n,s) s,
26 #include <vnet/tcp/tcp_error.def>
27 #undef tcp_error
28 };
29 
/* All TCP nodes have the same outgoing arcs */
#define foreach_tcp_state_next                  \
  _ (DROP4, "ip4-drop")                         \
  _ (DROP6, "ip6-drop")                         \
  _ (TCP4_OUTPUT, "tcp4-output")                \
  _ (TCP6_OUTPUT, "tcp6-output")
36 
37 typedef enum _tcp_established_next
38 {
39 #define _(s,n) TCP_ESTABLISHED_NEXT_##s,
41 #undef _
44 
45 typedef enum _tcp_rcv_process_next
46 {
47 #define _(s,n) TCP_RCV_PROCESS_NEXT_##s,
49 #undef _
52 
53 typedef enum _tcp_syn_sent_next
54 {
55 #define _(s,n) TCP_SYN_SENT_NEXT_##s,
57 #undef _
60 
61 typedef enum _tcp_listen_next
62 {
63 #define _(s,n) TCP_LISTEN_NEXT_##s,
65 #undef _
68 
69 /* Generic, state independent indices */
70 typedef enum _tcp_state_next
71 {
72 #define _(s,n) TCP_NEXT_##s,
74 #undef _
77 
/* Pick the v4/v6 output (resp. drop) arc for a segment. */
#define tcp_next_output(is_ip4) (is_ip4 ? TCP_NEXT_TCP4_OUTPUT          \
                                        : TCP_NEXT_TCP6_OUTPUT)

#define tcp_next_drop(is_ip4) (is_ip4 ? TCP_NEXT_DROP4                  \
                                      : TCP_NEXT_DROP6)
83 
84 /**
85  * Validate segment sequence number. As per RFC793:
86  *
87  * Segment Receive Test
88  * Length Window
89  * ------- ------- -------------------------------------------
90  * 0 0 SEG.SEQ = RCV.NXT
91  * 0 >0 RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND
92  * >0 0 not acceptable
93  * >0 >0 RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND
94  * or RCV.NXT =< SEG.SEQ+SEG.LEN-1 < RCV.NXT+RCV.WND
95  *
96  * This ultimately consists in checking if segment falls within the window.
97  * The one important difference compared to RFC793 is that we use rcv_las,
98  * or the rcv_nxt at last ack sent instead of rcv_nxt since that's the
99  * peer's reference when computing our receive window.
100  *
101  * This:
102  * seq_leq (end_seq, tc->rcv_las + tc->rcv_wnd) && seq_geq (seq, tc->rcv_las)
103  * however, is too strict when we have retransmits. Instead we just check that
104  * the seq is not beyond the right edge and that the end of the segment is not
105  * less than the left edge.
106  *
107  * N.B. rcv_nxt and rcv_wnd are both updated in this node if acks are sent, so
108  * use rcv_nxt in the right edge window test instead of rcv_las.
109  *
110  */
113 {
114  return (seq_geq (end_seq, tc->rcv_las)
115  && seq_leq (seq, tc->rcv_nxt + tc->rcv_wnd));
116 }
117 
118 /**
119  * Parse TCP header options.
120  *
121  * @param th TCP header
122  * @param to TCP options data structure to be populated
123  * @param is_syn set if packet is syn
124  * @return -1 if parsing failed
125  */
126 static inline int
128 {
129  const u8 *data;
130  u8 opt_len, opts_len, kind;
131  int j;
132  sack_block_t b;
133 
134  opts_len = (tcp_doff (th) << 2) - sizeof (tcp_header_t);
135  data = (const u8 *) (th + 1);
136 
137  /* Zero out all flags but those set in SYN */
138  to->flags &= (TCP_OPTS_FLAG_SACK_PERMITTED | TCP_OPTS_FLAG_WSCALE
139  | TCP_OPTS_FLAG_TSTAMP | TCP_OPTS_FLAG_MSS);
140 
141  for (; opts_len > 0; opts_len -= opt_len, data += opt_len)
142  {
143  kind = data[0];
144 
145  /* Get options length */
146  if (kind == TCP_OPTION_EOL)
147  break;
148  else if (kind == TCP_OPTION_NOOP)
149  {
150  opt_len = 1;
151  continue;
152  }
153  else
154  {
155  /* broken options */
156  if (opts_len < 2)
157  return -1;
158  opt_len = data[1];
159 
160  /* weird option length */
161  if (opt_len < 2 || opt_len > opts_len)
162  return -1;
163  }
164 
165  /* Parse options */
166  switch (kind)
167  {
168  case TCP_OPTION_MSS:
169  if (!is_syn)
170  break;
171  if ((opt_len == TCP_OPTION_LEN_MSS) && tcp_syn (th))
172  {
173  to->flags |= TCP_OPTS_FLAG_MSS;
174  to->mss = clib_net_to_host_u16 (*(u16 *) (data + 2));
175  }
176  break;
178  if (!is_syn)
179  break;
180  if ((opt_len == TCP_OPTION_LEN_WINDOW_SCALE) && tcp_syn (th))
181  {
182  to->flags |= TCP_OPTS_FLAG_WSCALE;
183  to->wscale = data[2];
184  if (to->wscale > TCP_MAX_WND_SCALE)
186  }
187  break;
189  if (is_syn)
190  to->flags |= TCP_OPTS_FLAG_TSTAMP;
191  if ((to->flags & TCP_OPTS_FLAG_TSTAMP)
192  && opt_len == TCP_OPTION_LEN_TIMESTAMP)
193  {
194  to->tsval = clib_net_to_host_u32 (*(u32 *) (data + 2));
195  to->tsecr = clib_net_to_host_u32 (*(u32 *) (data + 6));
196  }
197  break;
199  if (!is_syn)
200  break;
201  if (opt_len == TCP_OPTION_LEN_SACK_PERMITTED && tcp_syn (th))
202  to->flags |= TCP_OPTS_FLAG_SACK_PERMITTED;
203  break;
205  /* If SACK permitted was not advertised or a SYN, break */
206  if ((to->flags & TCP_OPTS_FLAG_SACK_PERMITTED) == 0 || tcp_syn (th))
207  break;
208 
209  /* If too short or not correctly formatted, break */
210  if (opt_len < 10 || ((opt_len - 2) % TCP_OPTION_LEN_SACK_BLOCK))
211  break;
212 
213  to->flags |= TCP_OPTS_FLAG_SACK;
214  to->n_sack_blocks = (opt_len - 2) / TCP_OPTION_LEN_SACK_BLOCK;
215  vec_reset_length (to->sacks);
216  for (j = 0; j < to->n_sack_blocks; j++)
217  {
218  b.start = clib_net_to_host_u32 (*(u32 *) (data + 2 + 8 * j));
219  b.end = clib_net_to_host_u32 (*(u32 *) (data + 6 + 8 * j));
220  vec_add1 (to->sacks, b);
221  }
222  break;
223  default:
224  /* Nothing to see here */
225  continue;
226  }
227  }
228  return 0;
229 }
230 
231 /**
232  * RFC1323: Check against wrapped sequence numbers (PAWS). If we have
233  * timestamp to echo and it's less than tsval_recent, drop segment
234  * but still send an ACK in order to retain TCP's mechanism for detecting
235  * and recovering from half-open connections
236  *
237  * Or at least that's what the theory says. It seems that this might not work
238  * very well with packet reordering and fast retransmit. XXX
239  */
240 always_inline int
242 {
243  return tcp_opts_tstamp (&tc->rcv_opts)
244  && timestamp_lt (tc->rcv_opts.tsval, tc->tsval_recent);
245 }
246 
247 /**
248  * Update tsval recent
249  */
250 always_inline void
252 {
253  /*
254  * RFC1323: If Last.ACK.sent falls within the range of sequence numbers
255  * of an incoming segment:
256  * SEG.SEQ <= Last.ACK.sent < SEG.SEQ + SEG.LEN
257  * then the TSval from the segment is copied to TS.Recent;
258  * otherwise, the TSval is ignored.
259  */
260  if (tcp_opts_tstamp (&tc->rcv_opts) && seq_leq (seq, tc->rcv_las)
261  && seq_leq (tc->rcv_las, seq_end))
262  {
263  ASSERT (timestamp_leq (tc->tsval_recent, tc->rcv_opts.tsval));
264  tc->tsval_recent = tc->rcv_opts.tsval;
265  tc->tsval_recent_age = tcp_time_now_w_thread (tc->c_thread_index);
266  }
267 }
268 
269 /**
270  * Validate incoming segment as per RFC793 p. 69 and RFC1323 p. 19
271  *
272  * It first verifies if segment has a wrapped sequence number (PAWS) and then
273  * does the processing associated to the first four steps (ignoring security
274  * and precedence): sequence number, rst bit and syn bit checks.
275  *
276  * @return 0 if segments passes validation.
277  */
278 static int
280  vlib_buffer_t * b0, tcp_header_t * th0, u32 * error0)
281 {
282  /* We could get a burst of RSTs interleaved with acks */
283  if (PREDICT_FALSE (tc0->state == TCP_STATE_CLOSED))
284  {
285  tcp_send_reset (tc0);
286  *error0 = TCP_ERROR_CONNECTION_CLOSED;
287  goto error;
288  }
289 
290  if (PREDICT_FALSE (!tcp_ack (th0) && !tcp_rst (th0) && !tcp_syn (th0)))
291  {
292  *error0 = TCP_ERROR_SEGMENT_INVALID;
293  goto error;
294  }
295 
296  if (PREDICT_FALSE (tcp_options_parse (th0, &tc0->rcv_opts, 0)))
297  {
298  *error0 = TCP_ERROR_OPTIONS;
299  goto error;
300  }
301 
303  {
304  *error0 = TCP_ERROR_PAWS;
305  TCP_EVT (TCP_EVT_PAWS_FAIL, tc0, vnet_buffer (b0)->tcp.seq_number,
306  vnet_buffer (b0)->tcp.seq_end);
307 
308  /* If it just so happens that a segment updates tsval_recent for a
309  * segment over 24 days old, invalidate tsval_recent. */
310  if (timestamp_lt (tc0->tsval_recent_age + TCP_PAWS_IDLE,
311  tcp_time_now_w_thread (tc0->c_thread_index)))
312  {
313  tc0->tsval_recent = tc0->rcv_opts.tsval;
314  clib_warning ("paws failed: 24-day old segment");
315  }
316  /* Drop after ack if not rst. Resets can fail paws check as per
317  * RFC 7323 sec. 5.2: When an <RST> segment is received, it MUST NOT
318  * be subjected to the PAWS check by verifying an acceptable value in
319  * SEG.TSval */
320  else if (!tcp_rst (th0))
321  {
322  tcp_program_ack (tc0);
323  TCP_EVT (TCP_EVT_DUPACK_SENT, tc0, vnet_buffer (b0)->tcp);
324  goto error;
325  }
326  }
327 
328  /* 1st: check sequence number */
329  if (!tcp_segment_in_rcv_wnd (tc0, vnet_buffer (b0)->tcp.seq_number,
330  vnet_buffer (b0)->tcp.seq_end))
331  {
332  /* SYN/SYN-ACK retransmit */
333  if (tcp_syn (th0)
334  && vnet_buffer (b0)->tcp.seq_number == tc0->rcv_nxt - 1)
335  {
336  tcp_options_parse (th0, &tc0->rcv_opts, 1);
337  if (tc0->state == TCP_STATE_SYN_RCVD)
338  {
339  tcp_send_synack (tc0);
340  TCP_EVT (TCP_EVT_SYN_RCVD, tc0, 0);
341  *error0 = TCP_ERROR_SYNS_RCVD;
342  }
343  else
344  {
345  tcp_program_ack (tc0);
346  TCP_EVT (TCP_EVT_SYNACK_RCVD, tc0);
347  *error0 = TCP_ERROR_SYN_ACKS_RCVD;
348  }
349  goto error;
350  }
351 
352  /* If our window is 0 and the packet is in sequence, let it pass
353  * through for ack processing. It should be dropped later. */
354  if (tc0->rcv_wnd < tc0->snd_mss
355  && tc0->rcv_nxt == vnet_buffer (b0)->tcp.seq_number)
356  goto check_reset;
357 
358  /* If we entered recovery and peer did so as well, there's a chance that
359  * dup acks won't be acceptable on either end because seq_end may be less
360  * than rcv_las. This can happen if acks are lost in both directions. */
361  if (tcp_in_recovery (tc0)
362  && seq_geq (vnet_buffer (b0)->tcp.seq_number,
363  tc0->rcv_las - tc0->rcv_wnd)
364  && seq_leq (vnet_buffer (b0)->tcp.seq_end,
365  tc0->rcv_nxt + tc0->rcv_wnd))
366  goto check_reset;
367 
368  *error0 = TCP_ERROR_RCV_WND;
369 
370  /* If we advertised a zero rcv_wnd and the segment is in the past or the
371  * next one that we expect, it is probably a window probe */
372  if ((tc0->flags & TCP_CONN_ZERO_RWND_SENT)
373  && seq_lt (vnet_buffer (b0)->tcp.seq_end,
374  tc0->rcv_las + tc0->rcv_opts.mss))
375  *error0 = TCP_ERROR_ZERO_RWND;
376 
377  tc0->errors.below_data_wnd += seq_lt (vnet_buffer (b0)->tcp.seq_end,
378  tc0->rcv_las);
379 
380  /* If not RST, send dup ack */
381  if (!tcp_rst (th0))
382  {
383  tcp_program_dupack (tc0);
384  TCP_EVT (TCP_EVT_DUPACK_SENT, tc0, vnet_buffer (b0)->tcp);
385  }
386  goto error;
387 
388  check_reset:
389  ;
390  }
391 
392  /* 2nd: check the RST bit */
393  if (PREDICT_FALSE (tcp_rst (th0)))
394  {
395  tcp_connection_reset (tc0);
396  *error0 = TCP_ERROR_RST_RCVD;
397  goto error;
398  }
399 
400  /* 3rd: check security and precedence (skip) */
401 
402  /* 4th: check the SYN bit (in window) */
403  if (PREDICT_FALSE (tcp_syn (th0)))
404  {
405  /* As per RFC5961 send challenge ack instead of reset */
406  tcp_program_ack (tc0);
407  *error0 = TCP_ERROR_SPURIOUS_SYN;
408  goto error;
409  }
410 
411  /* If segment in window, save timestamp */
412  tcp_update_timestamp (tc0, vnet_buffer (b0)->tcp.seq_number,
413  vnet_buffer (b0)->tcp.seq_end);
414  return 0;
415 
416 error:
417  return -1;
418 }
419 
420 always_inline int
422 {
423  /* SND.UNA =< SEG.ACK =< SND.NXT */
424  if (!(seq_leq (tc->snd_una, vnet_buffer (b)->tcp.ack_number)
425  && seq_leq (vnet_buffer (b)->tcp.ack_number, tc->snd_nxt)))
426  {
427  if (seq_leq (vnet_buffer (b)->tcp.ack_number, tc->snd_una_max)
428  && seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_una))
429  {
430  tc->snd_nxt = vnet_buffer (b)->tcp.ack_number;
431  goto acceptable;
432  }
433  *error = TCP_ERROR_ACK_INVALID;
434  return -1;
435  }
436 
437 acceptable:
438  tc->bytes_acked = vnet_buffer (b)->tcp.ack_number - tc->snd_una;
439  tc->snd_una = vnet_buffer (b)->tcp.ack_number;
440  *error = TCP_ERROR_ACK_OK;
441  return 0;
442 }
443 
444 /**
445  * Compute smoothed RTT as per VJ's '88 SIGCOMM and RFC6298
446  *
447  * Note that although the original article, srtt and rttvar are scaled
448  * to minimize round-off errors, here we don't. Instead, we rely on
449  * better precision time measurements.
450  *
451  * TODO support us rtt resolution
452  */
453 static void
455 {
456  int err, diff;
457 
458  if (tc->srtt != 0)
459  {
460  err = mrtt - tc->srtt;
461 
462  /* XXX Drop in RTT results in RTTVAR increase and bigger RTO.
463  * The increase should be bound */
464  tc->srtt = clib_max ((int) tc->srtt + (err >> 3), 1);
465  diff = (clib_abs (err) - (int) tc->rttvar) >> 2;
466  tc->rttvar = clib_max ((int) tc->rttvar + diff, 1);
467  }
468  else
469  {
470  /* First measurement. */
471  tc->srtt = mrtt;
472  tc->rttvar = mrtt >> 1;
473  }
474 }
475 
476 #ifndef CLIB_MARCH_VARIANT
477 void
479 {
480  tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX);
481  tc->rto = clib_max (tc->rto, TCP_RTO_MIN);
482 }
483 #endif /* CLIB_MARCH_VARIANT */
484 
485 /**
486  * Update RTT estimate and RTO timer
487  *
488  * Measure RTT: We have two sources of RTT measurements: TSOPT and ACK
489  * timing. Middle boxes are known to fiddle with TCP options so we
490  * should give higher priority to ACK timing.
491  *
492  * This should be called only if previously sent bytes have been acked.
493  *
494  * return 1 if valid rtt 0 otherwise
495  */
496 static int
498 {
499  u32 mrtt = 0;
500 
501  /* Karn's rule, part 1. Don't use retransmitted segments to estimate
502  * RTT because they're ambiguous. */
503  if (tcp_in_cong_recovery (tc))
504  {
505  /* Accept rtt estimates for samples that have not been retransmitted */
506  if ((tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE)
507  && !(rs->flags & TCP_BTS_IS_RXT))
508  {
509  mrtt = rs->rtt_time * THZ;
510  goto estimate_rtt;
511  }
512  goto done;
513  }
514 
515  if (tc->rtt_ts && seq_geq (ack, tc->rtt_seq))
516  {
517  f64 sample = tcp_time_now_us (tc->c_thread_index) - tc->rtt_ts;
518  tc->mrtt_us = tc->mrtt_us + (sample - tc->mrtt_us) * 0.125;
519  mrtt = clib_max ((u32) (sample * THZ), 1);
520  /* Allow measuring of a new RTT */
521  tc->rtt_ts = 0;
522  }
523  /* As per RFC7323 TSecr can be used for RTTM only if the segment advances
524  * snd_una, i.e., the left side of the send window:
525  * seq_lt (tc->snd_una, ack). This is a condition for calling update_rtt */
526  else if (tcp_opts_tstamp (&tc->rcv_opts) && tc->rcv_opts.tsecr)
527  {
528  u32 now = tcp_tstamp (tc);
529  mrtt = clib_max (now - tc->rcv_opts.tsecr, 1);
530  }
531 
532 estimate_rtt:
533 
534  /* Ignore dubious measurements */
535  if (mrtt == 0 || mrtt > TCP_RTT_MAX)
536  goto done;
537 
538  tcp_estimate_rtt (tc, mrtt);
539 
540 done:
541 
542  /* If we got here something must've been ACKed so make sure boff is 0,
543  * even if mrtt is not valid since we update the rto lower */
544  tc->rto_boff = 0;
545  tcp_update_rto (tc);
546 
547  return 0;
548 }
549 
550 static void
552 {
553  u8 thread_index = vlib_num_workers ()? 1 : 0;
554  int mrtt;
555 
556  if (tc->rtt_ts)
557  {
558  tc->mrtt_us = tcp_time_now_us (thread_index) - tc->rtt_ts;
559  tc->mrtt_us = clib_max (tc->mrtt_us, 0.0001);
560  mrtt = clib_max ((u32) (tc->mrtt_us * THZ), 1);
561  tc->rtt_ts = 0;
562  }
563  else
564  {
565  mrtt = tcp_time_now_w_thread (thread_index) - tc->rcv_opts.tsecr;
566  mrtt = clib_max (mrtt, 1);
567  /* Due to retransmits we don't know the initial mrtt */
568  if (tc->rto_boff && mrtt > 1 * THZ)
569  mrtt = 1 * THZ;
570  tc->mrtt_us = (f64) mrtt *TCP_TICK;
571  }
572 
573  if (mrtt > 0 && mrtt < TCP_RTT_MAX)
574  tcp_estimate_rtt (tc, mrtt);
575  tcp_update_rto (tc);
576 }
577 
578 /**
579  * Dequeue bytes for connections that have received acks in last burst
580  */
581 static void
583 {
584  u32 thread_index = wrk->vm->thread_index;
585  u32 *pending_deq_acked;
586  tcp_connection_t *tc;
587  int i;
588 
589  if (!vec_len (wrk->pending_deq_acked))
590  return;
591 
592  pending_deq_acked = wrk->pending_deq_acked;
593  for (i = 0; i < vec_len (pending_deq_acked); i++)
594  {
595  tc = tcp_connection_get (pending_deq_acked[i], thread_index);
596  tc->flags &= ~TCP_CONN_DEQ_PENDING;
597 
598  if (PREDICT_FALSE (!tc->burst_acked))
599  continue;
600 
601  /* Dequeue the newly ACKed bytes */
602  session_tx_fifo_dequeue_drop (&tc->connection, tc->burst_acked);
603  tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una);
604 
605  if (PREDICT_FALSE (tc->flags & TCP_CONN_PSH_PENDING))
606  {
607  if (seq_leq (tc->psh_seq, tc->snd_una))
608  tc->flags &= ~TCP_CONN_PSH_PENDING;
609  }
610 
611  /* If everything has been acked, stop retransmit timer
612  * otherwise update. */
614 
615  /* Update pacer based on our new cwnd estimate */
617 
618  tc->burst_acked = 0;
619  }
620  _vec_len (wrk->pending_deq_acked) = 0;
621 }
622 
623 static void
625 {
626  if (!(tc->flags & TCP_CONN_DEQ_PENDING))
627  {
628  vec_add1 (wrk->pending_deq_acked, tc->c_c_index);
629  tc->flags |= TCP_CONN_DEQ_PENDING;
630  }
631  tc->burst_acked += tc->bytes_acked;
632 }
633 
634 #ifndef CLIB_MARCH_VARIANT
635 static u32
637 {
638  ASSERT (!pool_is_free_index (sb->holes, hole - sb->holes));
639  return hole - sb->holes;
640 }
641 
642 static u32
644 {
645  return hole->end - hole->start;
646 }
647 
650 {
651  if (index != TCP_INVALID_SACK_HOLE_INDEX)
652  return pool_elt_at_index (sb->holes, index);
653  return 0;
654 }
655 
658 {
659  if (hole->next != TCP_INVALID_SACK_HOLE_INDEX)
660  return pool_elt_at_index (sb->holes, hole->next);
661  return 0;
662 }
663 
666 {
667  if (hole->prev != TCP_INVALID_SACK_HOLE_INDEX)
668  return pool_elt_at_index (sb->holes, hole->prev);
669  return 0;
670 }
671 
674 {
675  if (sb->head != TCP_INVALID_SACK_HOLE_INDEX)
676  return pool_elt_at_index (sb->holes, sb->head);
677  return 0;
678 }
679 
682 {
683  if (sb->tail != TCP_INVALID_SACK_HOLE_INDEX)
684  return pool_elt_at_index (sb->holes, sb->tail);
685  return 0;
686 }
687 
688 static void
690 {
691  sack_scoreboard_hole_t *next, *prev;
692 
693  if (hole->next != TCP_INVALID_SACK_HOLE_INDEX)
694  {
695  next = pool_elt_at_index (sb->holes, hole->next);
696  next->prev = hole->prev;
697  }
698  else
699  {
700  sb->tail = hole->prev;
701  }
702 
703  if (hole->prev != TCP_INVALID_SACK_HOLE_INDEX)
704  {
705  prev = pool_elt_at_index (sb->holes, hole->prev);
706  prev->next = hole->next;
707  }
708  else
709  {
710  sb->head = hole->next;
711  }
712 
713  if (scoreboard_hole_index (sb, hole) == sb->cur_rxt_hole)
714  sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
715 
716  /* Poison the entry */
717  if (CLIB_DEBUG > 0)
718  clib_memset (hole, 0xfe, sizeof (*hole));
719 
720  pool_put (sb->holes, hole);
721 }
722 
723 static sack_scoreboard_hole_t *
725  u32 start, u32 end)
726 {
727  sack_scoreboard_hole_t *hole, *next, *prev;
728  u32 hole_index;
729 
730  pool_get (sb->holes, hole);
731  clib_memset (hole, 0, sizeof (*hole));
732 
733  hole->start = start;
734  hole->end = end;
735  hole_index = scoreboard_hole_index (sb, hole);
736 
737  prev = scoreboard_get_hole (sb, prev_index);
738  if (prev)
739  {
740  hole->prev = prev_index;
741  hole->next = prev->next;
742 
743  if ((next = scoreboard_next_hole (sb, hole)))
744  next->prev = hole_index;
745  else
746  sb->tail = hole_index;
747 
748  prev->next = hole_index;
749  }
750  else
751  {
752  sb->head = hole_index;
753  hole->prev = TCP_INVALID_SACK_HOLE_INDEX;
754  hole->next = TCP_INVALID_SACK_HOLE_INDEX;
755  }
756 
757  return hole;
758 }
759 
760 always_inline void
762  u8 has_rxt)
763 {
764  if (!has_rxt || seq_geq (start, sb->high_rxt))
765  return;
766 
767  sb->rxt_sacked +=
768  seq_lt (end, sb->high_rxt) ? (end - start) : (sb->high_rxt - start);
769 }
770 
771 always_inline void
773 {
775  u32 sacked = 0, blks = 0, old_sacked;
776 
777  old_sacked = sb->sacked_bytes;
778 
779  sb->last_lost_bytes = 0;
780  sb->lost_bytes = 0;
781  sb->sacked_bytes = 0;
782 
783  right = scoreboard_last_hole (sb);
784  if (!right)
785  {
786  sb->sacked_bytes = sb->high_sacked - ack;
787  sb->last_sacked_bytes = sb->sacked_bytes
788  - (old_sacked - sb->last_bytes_delivered);
789  return;
790  }
791 
792  if (seq_gt (sb->high_sacked, right->end))
793  {
794  sacked = sb->high_sacked - right->end;
795  blks = 1;
796  }
797 
798  while (sacked < (TCP_DUPACK_THRESHOLD - 1) * snd_mss
799  && blks < TCP_DUPACK_THRESHOLD)
800  {
801  if (right->is_lost)
802  sb->lost_bytes += scoreboard_hole_bytes (right);
803 
804  left = scoreboard_prev_hole (sb, right);
805  if (!left)
806  {
807  ASSERT (right->start == ack || sb->is_reneging);
808  sacked += right->start - ack;
809  right = 0;
810  break;
811  }
812 
813  sacked += right->start - left->end;
814  blks++;
815  right = left;
816  }
817 
818  /* right is first lost */
819  while (right)
820  {
821  sb->lost_bytes += scoreboard_hole_bytes (right);
822  sb->last_lost_bytes += right->is_lost ? 0 : (right->end - right->start);
823  right->is_lost = 1;
824  left = scoreboard_prev_hole (sb, right);
825  if (!left)
826  {
827  ASSERT (right->start == ack || sb->is_reneging);
828  sacked += right->start - ack;
829  break;
830  }
831  sacked += right->start - left->end;
832  right = left;
833  }
834 
835  sb->sacked_bytes = sacked;
836  sb->last_sacked_bytes = sacked - (old_sacked - sb->last_bytes_delivered);
837 }
838 
839 /**
840  * Figure out the next hole to retransmit
841  *
842  * Follows logic proposed in RFC6675 Sec. 4, NextSeg()
843  */
846  sack_scoreboard_hole_t * start,
847  u8 have_unsent, u8 * can_rescue, u8 * snd_limited)
848 {
849  sack_scoreboard_hole_t *hole = 0;
850 
851  hole = start ? start : scoreboard_first_hole (sb);
852  while (hole && seq_leq (hole->end, sb->high_rxt) && hole->is_lost)
853  hole = scoreboard_next_hole (sb, hole);
854 
855  /* Nothing, return */
856  if (!hole)
857  {
858  sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
859  return 0;
860  }
861 
862  /* Rule (1): if higher than rxt, less than high_sacked and lost */
863  if (hole->is_lost && seq_lt (hole->start, sb->high_sacked))
864  {
865  sb->cur_rxt_hole = scoreboard_hole_index (sb, hole);
866  }
867  else
868  {
869  /* Rule (2): available unsent data */
870  if (have_unsent)
871  {
872  sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
873  return 0;
874  }
875  /* Rule (3): if hole not lost */
876  else if (seq_lt (hole->start, sb->high_sacked))
877  {
878  /* And we didn't already retransmit it */
879  if (seq_leq (hole->end, sb->high_rxt))
880  {
881  sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
882  return 0;
883  }
884  *snd_limited = 0;
885  sb->cur_rxt_hole = scoreboard_hole_index (sb, hole);
886  }
887  /* Rule (4): if hole beyond high_sacked */
888  else
889  {
890  ASSERT (seq_geq (hole->start, sb->high_sacked));
891  *snd_limited = 1;
892  *can_rescue = 1;
893  /* HighRxt MUST NOT be updated */
894  return 0;
895  }
896  }
897 
898  if (hole && seq_lt (sb->high_rxt, hole->start))
899  sb->high_rxt = hole->start;
900 
901  return hole;
902 }
903 
904 void
906 {
908  hole = scoreboard_first_hole (sb);
909  if (hole)
910  {
911  snd_una = seq_gt (snd_una, hole->start) ? snd_una : hole->start;
912  sb->cur_rxt_hole = sb->head;
913  }
914  sb->high_rxt = snd_una;
915  sb->rescue_rxt = snd_una - 1;
916 }
917 
918 void
920 {
921  sb->head = TCP_INVALID_SACK_HOLE_INDEX;
922  sb->tail = TCP_INVALID_SACK_HOLE_INDEX;
923  sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
924 }
925 
926 void
928 {
930  while ((hole = scoreboard_first_hole (sb)))
931  {
932  scoreboard_remove_hole (sb, hole);
933  }
934  ASSERT (sb->head == sb->tail && sb->head == TCP_INVALID_SACK_HOLE_INDEX);
935  ASSERT (pool_elts (sb->holes) == 0);
936  sb->sacked_bytes = 0;
937  sb->last_sacked_bytes = 0;
938  sb->last_bytes_delivered = 0;
939  sb->lost_bytes = 0;
940  sb->last_lost_bytes = 0;
941  sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
942  sb->is_reneging = 0;
943 }
944 
945 void
947 {
948  sack_scoreboard_hole_t *last_hole;
949 
950  clib_warning ("sack reneging");
951 
952  scoreboard_clear (sb);
954  start, end);
955  last_hole->is_lost = 1;
956  sb->tail = scoreboard_hole_index (sb, last_hole);
957  sb->high_sacked = start;
958  scoreboard_init_rxt (sb, start);
959 }
960 
961 #endif /* CLIB_MARCH_VARIANT */
962 
963 /**
964  * Test that scoreboard is sane after recovery
965  *
966  * Returns 1 if scoreboard is empty or if first hole beyond
967  * snd_una.
968  */
969 static u8
971 {
973  hole = scoreboard_first_hole (&tc->sack_sb);
974  return (!hole || (seq_geq (hole->start, tc->snd_una)
975  && seq_lt (hole->end, tc->snd_nxt)));
976 }
977 
978 #ifndef CLIB_MARCH_VARIANT
979 
980 void
982 {
983  sack_scoreboard_hole_t *hole, *next_hole;
984  sack_scoreboard_t *sb = &tc->sack_sb;
985  sack_block_t *blk, *rcv_sacks;
986  u32 blk_index = 0, i, j;
987  u8 has_rxt;
988 
989  sb->last_sacked_bytes = 0;
990  sb->last_bytes_delivered = 0;
991  sb->rxt_sacked = 0;
992 
993  if (!tcp_opts_sack (&tc->rcv_opts) && !sb->sacked_bytes
994  && sb->head == TCP_INVALID_SACK_HOLE_INDEX)
995  return;
996 
997  has_rxt = tcp_in_cong_recovery (tc);
998 
999  /* Remove invalid blocks */
1000  blk = tc->rcv_opts.sacks;
1001  while (blk < vec_end (tc->rcv_opts.sacks))
1002  {
1003  if (seq_lt (blk->start, blk->end)
1004  && seq_gt (blk->start, tc->snd_una)
1005  && seq_gt (blk->start, ack)
1006  && seq_lt (blk->start, tc->snd_nxt)
1007  && seq_leq (blk->end, tc->snd_nxt))
1008  {
1009  blk++;
1010  continue;
1011  }
1012  vec_del1 (tc->rcv_opts.sacks, blk - tc->rcv_opts.sacks);
1013  }
1014 
1015  /* Add block for cumulative ack */
1016  if (seq_gt (ack, tc->snd_una))
1017  {
1018  vec_add2 (tc->rcv_opts.sacks, blk, 1);
1019  blk->start = tc->snd_una;
1020  blk->end = ack;
1021  }
1022 
1023  if (vec_len (tc->rcv_opts.sacks) == 0)
1024  return;
1025 
1026  tcp_scoreboard_trace_add (tc, ack);
1027 
1028  /* Make sure blocks are ordered */
1029  rcv_sacks = tc->rcv_opts.sacks;
1030  for (i = 0; i < vec_len (rcv_sacks); i++)
1031  for (j = i + 1; j < vec_len (rcv_sacks); j++)
1032  if (seq_lt (rcv_sacks[j].start, rcv_sacks[i].start))
1033  {
1034  sack_block_t tmp = rcv_sacks[i];
1035  rcv_sacks[i] = rcv_sacks[j];
1036  rcv_sacks[j] = tmp;
1037  }
1038 
1039  if (sb->head == TCP_INVALID_SACK_HOLE_INDEX)
1040  {
1041  /* Handle reneging as a special case */
1042  if (PREDICT_FALSE (sb->is_reneging))
1043  {
1044  /* No holes, only sacked bytes */
1045  if (seq_leq (tc->snd_nxt, sb->high_sacked))
1046  {
1047  /* No progress made so return */
1048  if (seq_leq (ack, tc->snd_una))
1049  return;
1050 
1051  /* Update sacked bytes delivered and return */
1052  sb->last_bytes_delivered = ack - tc->snd_una;
1053  sb->sacked_bytes -= sb->last_bytes_delivered;
1054  sb->is_reneging = seq_lt (ack, sb->high_sacked);
1055  return;
1056  }
1057 
1058  /* New hole above high sacked. Add it and process normally */
1060  sb->high_sacked, tc->snd_nxt);
1061  sb->tail = scoreboard_hole_index (sb, hole);
1062  }
1063  /* Not reneging and no holes. Insert the first that covers all
1064  * outstanding bytes */
1065  else
1066  {
1068  tc->snd_una, tc->snd_nxt);
1069  sb->tail = scoreboard_hole_index (sb, hole);
1070  }
1071  sb->high_sacked = rcv_sacks[vec_len (rcv_sacks) - 1].end;
1072  }
1073  else
1074  {
1075  /* If we have holes but snd_nxt is beyond the last hole, update
1076  * last hole end or add new hole after high sacked */
1077  hole = scoreboard_last_hole (sb);
1078  if (seq_gt (tc->snd_nxt, hole->end))
1079  {
1080  if (seq_geq (hole->start, sb->high_sacked))
1081  {
1082  hole->end = tc->snd_nxt;
1083  }
1084  /* New hole after high sacked block */
1085  else if (seq_lt (sb->high_sacked, tc->snd_nxt))
1086  {
1087  scoreboard_insert_hole (sb, sb->tail, sb->high_sacked,
1088  tc->snd_nxt);
1089  }
1090  }
1091 
1092  /* Keep track of max byte sacked for when the last hole
1093  * is acked */
1094  sb->high_sacked = seq_max (rcv_sacks[vec_len (rcv_sacks) - 1].end,
1095  sb->high_sacked);
1096  }
1097 
1098  /* Walk the holes with the SACK blocks */
1099  hole = pool_elt_at_index (sb->holes, sb->head);
1100 
1101  if (PREDICT_FALSE (sb->is_reneging))
1102  {
1103  sb->last_bytes_delivered += clib_min (hole->start - tc->snd_una,
1104  ack - tc->snd_una);
1105  sb->is_reneging = seq_lt (ack, hole->start);
1106  }
1107 
1108  while (hole && blk_index < vec_len (rcv_sacks))
1109  {
1110  blk = &rcv_sacks[blk_index];
1111  if (seq_leq (blk->start, hole->start))
1112  {
1113  /* Block covers hole. Remove hole */
1114  if (seq_geq (blk->end, hole->end))
1115  {
1116  next_hole = scoreboard_next_hole (sb, hole);
1117 
1118  /* If covered by ack, compute delivered bytes */
1119  if (blk->end == ack)
1120  {
1121  u32 sacked = next_hole ? next_hole->start : sb->high_sacked;
1122  if (PREDICT_FALSE (seq_lt (ack, sacked)))
1123  {
1124  sb->last_bytes_delivered += ack - hole->end;
1125  sb->is_reneging = 1;
1126  }
1127  else
1128  {
1129  sb->last_bytes_delivered += sacked - hole->end;
1130  sb->is_reneging = 0;
1131  }
1132  }
1133  scoreboard_update_sacked_rxt (sb, hole->start, hole->end,
1134  has_rxt);
1135  scoreboard_remove_hole (sb, hole);
1136  hole = next_hole;
1137  }
1138  /* Partial 'head' overlap */
1139  else
1140  {
1141  if (seq_gt (blk->end, hole->start))
1142  {
1143  scoreboard_update_sacked_rxt (sb, hole->start, blk->end,
1144  has_rxt);
1145  hole->start = blk->end;
1146  }
1147  blk_index++;
1148  }
1149  }
1150  else
1151  {
1152  /* Hole must be split */
1153  if (seq_lt (blk->end, hole->end))
1154  {
1155  u32 hole_index = scoreboard_hole_index (sb, hole);
1156  next_hole = scoreboard_insert_hole (sb, hole_index, blk->end,
1157  hole->end);
1158  /* Pool might've moved */
1159  hole = scoreboard_get_hole (sb, hole_index);
1160  hole->end = blk->start;
1161 
1162  scoreboard_update_sacked_rxt (sb, blk->start, blk->end,
1163  has_rxt);
1164 
1165  blk_index++;
1166  ASSERT (hole->next == scoreboard_hole_index (sb, next_hole));
1167  }
1168  else if (seq_lt (blk->start, hole->end))
1169  {
1170  scoreboard_update_sacked_rxt (sb, blk->start, hole->end,
1171  has_rxt);
1172  hole->end = blk->start;
1173  }
1174  hole = scoreboard_next_hole (sb, hole);
1175  }
1176  }
1177 
1178  scoreboard_update_bytes (sb, ack, tc->snd_mss);
1179 
1180  ASSERT (sb->last_sacked_bytes <= sb->sacked_bytes || tcp_in_recovery (tc));
1181  ASSERT (sb->sacked_bytes == 0 || tcp_in_recovery (tc)
1182  || sb->sacked_bytes <= tc->snd_nxt - seq_max (tc->snd_una, ack));
1183  ASSERT (sb->last_sacked_bytes + sb->lost_bytes <= tc->snd_nxt
1184  - seq_max (tc->snd_una, ack) || tcp_in_recovery (tc));
1186  || sb->is_reneging || sb->holes[sb->head].start == ack);
1187  ASSERT (sb->last_lost_bytes <= sb->lost_bytes);
1188  ASSERT ((ack - tc->snd_una) + sb->last_sacked_bytes
1189  - sb->last_bytes_delivered >= sb->rxt_sacked);
1190  ASSERT ((ack - tc->snd_una) >= tc->sack_sb.last_bytes_delivered
1191  || (tc->flags & TCP_CONN_FINSNT));
1192 
1193  TCP_EVT (TCP_EVT_CC_SCOREBOARD, tc);
1194 }
1195 #endif /* CLIB_MARCH_VARIANT */
1196 
1197 /**
1198  * Try to update snd_wnd based on feedback received from peer.
1199  *
1200  * If successful, and new window is 'effectively' 0, activate persist
1201  * timer.
1202  */
1203 static void
1204 tcp_update_snd_wnd (tcp_connection_t * tc, u32 seq, u32 ack, u32 snd_wnd)
1205 {
1206  /* If (SND.WL1 < SEG.SEQ or (SND.WL1 = SEG.SEQ and SND.WL2 =< SEG.ACK)), set
1207  * SND.WND <- SEG.WND, set SND.WL1 <- SEG.SEQ, and set SND.WL2 <- SEG.ACK */
1208  if (seq_lt (tc->snd_wl1, seq)
1209  || (tc->snd_wl1 == seq && seq_leq (tc->snd_wl2, ack)))
1210  {
1211  tc->snd_wnd = snd_wnd;
1212  tc->snd_wl1 = seq;
1213  tc->snd_wl2 = ack;
1214  TCP_EVT (TCP_EVT_SND_WND, tc);
1215 
1216  if (PREDICT_FALSE (tc->snd_wnd < tc->snd_mss))
1217  {
1218  /* Set persist timer if not set and we just got 0 wnd */
1219  if (!tcp_timer_is_active (tc, TCP_TIMER_PERSIST)
1220  && !tcp_timer_is_active (tc, TCP_TIMER_RETRANSMIT))
1221  tcp_persist_timer_set (tc);
1222  }
1223  else
1224  {
1226  if (PREDICT_FALSE (!tcp_in_recovery (tc) && tc->rto_boff > 0))
1227  {
1228  tc->rto_boff = 0;
1229  tcp_update_rto (tc);
1230  }
1231  }
1232  }
1233 }
1234 
1235 /**
1236  * Init loss recovery/fast recovery.
1237  *
1238  * Triggered by dup acks as opposed to timer timeout. Note that cwnd is
1239  * updated in @ref tcp_cc_handle_event after fast retransmit
1240  */
1241 static void
1243 {
1244  tcp_fastrecovery_on (tc);
1245  tc->snd_congestion = tc->snd_nxt;
1246  tc->cwnd_acc_bytes = 0;
1247  tc->snd_rxt_bytes = 0;
1248  tc->rxt_delivered = 0;
1249  tc->prr_delivered = 0;
1250  tc->prr_start = tc->snd_una;
1251  tc->prev_ssthresh = tc->ssthresh;
1252  tc->prev_cwnd = tc->cwnd;
1253 
1254  tc->snd_rxt_ts = tcp_tstamp (tc);
1255  tcp_cc_congestion (tc);
1256 
1257  /* Post retransmit update cwnd to ssthresh and account for the
1258  * three segments that have left the network and should've been
1259  * buffered at the receiver XXX */
1260  if (!tcp_opts_sack_permitted (&tc->rcv_opts))
1261  tc->cwnd += 3 * tc->snd_mss;
1262 
1263  tc->fr_occurences += 1;
1264  TCP_EVT (TCP_EVT_CC_EVT, tc, 4);
1265 }
1266 
1267 static void
1269 {
1270  tc->cwnd = tc->prev_cwnd;
1271  tc->ssthresh = tc->prev_ssthresh;
1272  tcp_cc_undo_recovery (tc);
1273  ASSERT (tc->rto_boff == 0);
1274  TCP_EVT (TCP_EVT_CC_EVT, tc, 5);
1275 }
1276 
1277 static inline u8
1279 {
1280  return (tcp_in_recovery (tc) && tc->rto_boff == 1
1281  && tc->snd_rxt_ts
1282  && tcp_opts_tstamp (&tc->rcv_opts)
1283  && timestamp_lt (tc->rcv_opts.tsecr, tc->snd_rxt_ts));
1284 }
1285 
1286 static inline u8
1288 {
1289  return (tcp_cc_is_spurious_timeout_rxt (tc));
1290 }
1291 
1292 static inline u8
1294 {
1295  return (tc->sack_sb.lost_bytes
1296  || ((TCP_DUPACK_THRESHOLD - 1) * tc->snd_mss
1297  < tc->sack_sb.sacked_bytes));
1298 }
1299 
1300 static inline u8
1302 {
1303  if (!has_sack)
1304  {
1305  /* If of of the two conditions lower hold, reset dupacks because
1306  * we're probably after timeout (RFC6582 heuristics).
1307  * If Cumulative ack does not cover more than congestion threshold,
1308  * and:
1309  * 1) The following doesn't hold: The congestion window is greater
1310  * than SMSS bytes and the difference between highest_ack
1311  * and prev_highest_ack is at most 4*SMSS bytes
1312  * 2) Echoed timestamp in the last non-dup ack does not equal the
1313  * stored timestamp
1314  */
1315  if (seq_leq (tc->snd_una, tc->snd_congestion)
1316  && ((!(tc->cwnd > tc->snd_mss
1317  && tc->bytes_acked <= 4 * tc->snd_mss))
1318  || (tc->rcv_opts.tsecr != tc->tsecr_last_ack)))
1319  {
1320  tc->rcv_dupacks = 0;
1321  return 0;
1322  }
1323  }
1324  return ((tc->rcv_dupacks == TCP_DUPACK_THRESHOLD)
1325  || tcp_should_fastrecover_sack (tc));
1326 }
1327 
1328 static int
1330 {
1331  sack_scoreboard_hole_t *hole;
1332  u8 is_spurious = 0;
1333 
1335 
1337  {
1339  is_spurious = 1;
1340  }
1341 
1342  tcp_connection_tx_pacer_reset (tc, tc->cwnd, 0 /* start bucket */ );
1343  tc->rcv_dupacks = 0;
1344 
1345  /* Previous recovery left us congested. Continue sending as part
1346  * of the current recovery event with an updated snd_congestion */
1347  if (tc->sack_sb.sacked_bytes)
1348  {
1349  tc->snd_congestion = tc->snd_nxt;
1351  return is_spurious;
1352  }
1353 
1354  tc->rxt_delivered = 0;
1355  tc->snd_rxt_bytes = 0;
1356  tc->snd_rxt_ts = 0;
1357  tc->prr_delivered = 0;
1358  tc->rtt_ts = 0;
1359  tc->flags &= ~TCP_CONN_RXT_PENDING;
1360 
1361  hole = scoreboard_first_hole (&tc->sack_sb);
1362  if (hole && hole->start == tc->snd_una && hole->end == tc->snd_nxt)
1363  scoreboard_clear (&tc->sack_sb);
1364 
1365  if (!tcp_in_recovery (tc) && !is_spurious)
1366  tcp_cc_recovered (tc);
1367 
1368  tcp_fastrecovery_off (tc);
1370  tcp_recovery_off (tc);
1371  TCP_EVT (TCP_EVT_CC_EVT, tc, 3);
1372 
1373  ASSERT (tc->rto_boff == 0);
1374  ASSERT (!tcp_in_cong_recovery (tc));
1376  return is_spurious;
1377 }
1378 
1379 static void
1381 {
1383 
1384  /* Congestion avoidance */
1385  tcp_cc_rcv_ack (tc, rs);
1386 
1387  /* If a cumulative ack, make sure dupacks is 0 */
1388  tc->rcv_dupacks = 0;
1389 
1390  /* When dupacks hits the threshold we only enter fast retransmit if
1391  * cumulative ack covers more than snd_congestion. Should snd_una
1392  * wrap this test may fail under otherwise valid circumstances.
1393  * Therefore, proactively update snd_congestion when wrap detected. */
1394  if (PREDICT_FALSE
1395  (seq_leq (tc->snd_congestion, tc->snd_una - tc->bytes_acked)
1396  && seq_gt (tc->snd_congestion, tc->snd_una)))
1397  tc->snd_congestion = tc->snd_una - 1;
1398 }
1399 
/**
 * One function to rule them all ... and in the darkness bind them
 */
static void
/* NOTE(review): declarator line lost in doc extraction; upstream signature
 * is tcp_cc_handle_event (tcp_connection_t * tc, tcp_rate_sample_t * rs,
 * u32 is_dack) -- confirm against repository source. */
		     u32 is_dack)
{
  u8 has_sack = tcp_opts_sack_permitted (&tc->rcv_opts);

  /* If reneging, wait for timer based retransmits */
  if (PREDICT_FALSE (tcp_is_lost_fin (tc) || tc->sack_sb.is_reneging))
    return;

  /*
   * If not in recovery, figure out if we should enter
   */
  if (!tcp_in_cong_recovery (tc))
    {
      ASSERT (is_dack);

      tc->rcv_dupacks++;
      TCP_EVT (TCP_EVT_DUPACK_RCVD, tc, 1);
      /* NOTE(review): a statement was lost here in doc extraction -- confirm
       * against repository source. */

      if (tcp_should_fastrecover (tc, has_sack))
	{
	  /* NOTE(review): line lost here (congestion recovery init call). */

	  if (has_sack)
	    scoreboard_init_rxt (&tc->sack_sb, tc->snd_una);

	  tcp_connection_tx_pacer_reset (tc, tc->cwnd, 0 /* start bucket */ );
	  /* NOTE(review): line lost here (likely retransmit scheduling). */
	}

      return;
    }

  /*
   * Already in recovery
   */

  /*
   * Process (re)transmit feedback. Output path uses this to decide how much
   * more data to release into the network
   */
  if (has_sack)
    {
      if (!tc->bytes_acked && tc->sack_sb.rxt_sacked)
	/* NOTE(review): the statement guarded by the if above was lost in
	 * doc extraction -- confirm against repository source. */

      tc->rxt_delivered += tc->sack_sb.rxt_sacked;
      tc->prr_delivered += tc->bytes_acked + tc->sack_sb.last_sacked_bytes
	- tc->sack_sb.last_bytes_delivered;

      /* NOTE(review): line lost here (likely retransmit scheduling). */
    }
  else
    {
      if (is_dack)
	{
	  tc->rcv_dupacks += 1;
	  TCP_EVT (TCP_EVT_DUPACK_RCVD, tc, 1);
	}
      /* Without sack, estimate retransmitted-delivered bytes from acks */
      tc->rxt_delivered = clib_max (tc->rxt_delivered + tc->bytes_acked,
				    tc->snd_rxt_bytes);
      if (is_dack)
	tc->prr_delivered += clib_min (tc->snd_mss,
				       tc->snd_nxt - tc->snd_una);
      else
	tc->prr_delivered += tc->bytes_acked - clib_min (tc->bytes_acked,
							 tc->snd_mss *
							 tc->rcv_dupacks);

      /* If partial ack, assume that the first un-acked segment was lost */
      if (tc->bytes_acked || tc->rcv_dupacks == TCP_DUPACK_THRESHOLD)
	/* NOTE(review): the statement guarded by the if above was lost in
	 * doc extraction; a retransmit-scheduling call that followed it was
	 * also lost -- confirm against repository source. */

    }

  /*
   * See if we can exit and stop retransmitting
   */
  if (seq_geq (tc->snd_una, tc->snd_congestion))
    {
      /* If spurious return, we've already updated everything */
      if (tcp_cc_recover (tc))
	{
	  tc->tsecr_last_ack = tc->rcv_opts.tsecr;
	  return;
	}

      /* Treat as congestion avoidance ack */
      tcp_cc_rcv_ack (tc, rs);
      return;
    }

  /*
   * Notify cc of the event
   */

  if (!tc->bytes_acked)
    {
      /* NOTE(review): cc-notification call lost here in doc extraction. */
      return;
    }

  /* RFC6675: If the incoming ACK is a cumulative acknowledgment,
   * reset dupacks to 0. Also needed if in congestion recovery */
  tc->rcv_dupacks = 0;

  if (tcp_in_recovery (tc))
    tcp_cc_rcv_ack (tc, rs);
  else
    /* NOTE(review): partial-ack cc-notification call lost here in doc
     * extraction -- confirm against repository source. */
}
1517 
1518 static void
1520 {
1521  if (!tcp_in_cong_recovery (tc))
1522  return;
1523 
1524  if (tcp_opts_sack_permitted (&tc->rcv_opts))
1525  tcp_rcv_sacks (tc, tc->snd_una);
1526 
1527  tc->bytes_acked = 0;
1528 
1529  if (tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE)
1530  tcp_bt_sample_delivery_rate (tc, rs);
1531 
1532  tcp_cc_handle_event (tc, rs, 1);
1533 }
1534 
1535 /**
1536  * Check if duplicate ack as per RFC5681 Sec. 2
1537  */
1540  u32 prev_snd_una)
1541 {
1542  return ((vnet_buffer (b)->tcp.ack_number == prev_snd_una)
1543  && seq_gt (tc->snd_nxt, tc->snd_una)
1544  && (vnet_buffer (b)->tcp.seq_end == vnet_buffer (b)->tcp.seq_number)
1545  && (prev_snd_wnd == tc->snd_wnd));
1546 }
1547 
1548 /**
1549  * Checks if ack is a congestion control event.
1550  */
1551 static u8
1553  u32 prev_snd_wnd, u32 prev_snd_una, u8 * is_dack)
1554 {
1555  /* Check if ack is duplicate. Per RFC 6675, ACKs that SACK new data are
1556  * defined to be 'duplicate' as well */
1557  *is_dack = tc->sack_sb.last_sacked_bytes
1558  || tcp_ack_is_dupack (tc, b, prev_snd_wnd, prev_snd_una);
1559 
1560  return (*is_dack || tcp_in_cong_recovery (tc));
1561 }
1562 
1563 /**
1564  * Process incoming ACK
1565  */
1566 static int
1568  tcp_header_t * th, u32 * error)
1569 {
1570  u32 prev_snd_wnd, prev_snd_una;
1571  tcp_rate_sample_t rs = { 0 };
1572  u8 is_dack;
1573 
1574  TCP_EVT (TCP_EVT_CC_STAT, tc);
1575 
1576  /* If the ACK acks something not yet sent (SEG.ACK > SND.NXT) */
1577  if (PREDICT_FALSE (seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_nxt)))
1578  {
1579  /* We've probably entered recovery and the peer still has some
1580  * of the data we've sent. Update snd_nxt and accept the ack */
1581  if (seq_leq (vnet_buffer (b)->tcp.ack_number, tc->snd_una_max)
1582  && seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_una))
1583  {
1584  tc->snd_nxt = vnet_buffer (b)->tcp.ack_number;
1585  goto process_ack;
1586  }
1587 
1588  tc->errors.above_ack_wnd += 1;
1589  *error = TCP_ERROR_ACK_FUTURE;
1590  TCP_EVT (TCP_EVT_ACK_RCV_ERR, tc, 0, vnet_buffer (b)->tcp.ack_number);
1591  return -1;
1592  }
1593 
1594  /* If old ACK, probably it's an old dupack */
1595  if (PREDICT_FALSE (seq_lt (vnet_buffer (b)->tcp.ack_number, tc->snd_una)))
1596  {
1597  tc->errors.below_ack_wnd += 1;
1598  *error = TCP_ERROR_ACK_OLD;
1599  TCP_EVT (TCP_EVT_ACK_RCV_ERR, tc, 1, vnet_buffer (b)->tcp.ack_number);
1600 
1601  if (seq_lt (vnet_buffer (b)->tcp.ack_number, tc->snd_una - tc->rcv_wnd))
1602  return -1;
1603 
1604  tcp_handle_old_ack (tc, &rs);
1605 
1606  /* Don't drop yet */
1607  return 0;
1608  }
1609 
1610 process_ack:
1611 
1612  /*
1613  * Looks okay, process feedback
1614  */
1615 
1616  if (tcp_opts_sack_permitted (&tc->rcv_opts))
1617  tcp_rcv_sacks (tc, vnet_buffer (b)->tcp.ack_number);
1618 
1619  prev_snd_wnd = tc->snd_wnd;
1620  prev_snd_una = tc->snd_una;
1621  tcp_update_snd_wnd (tc, vnet_buffer (b)->tcp.seq_number,
1622  vnet_buffer (b)->tcp.ack_number,
1623  clib_net_to_host_u16 (th->window) << tc->snd_wscale);
1624  tc->bytes_acked = vnet_buffer (b)->tcp.ack_number - tc->snd_una;
1625  tc->snd_una = vnet_buffer (b)->tcp.ack_number;
1626  tcp_validate_txf_size (tc, tc->bytes_acked);
1627 
1628  if (tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE)
1629  tcp_bt_sample_delivery_rate (tc, &rs);
1630 
1631  if (tc->bytes_acked)
1632  {
1633  tcp_program_dequeue (wrk, tc);
1634  tcp_update_rtt (tc, &rs, vnet_buffer (b)->tcp.ack_number);
1635  }
1636 
1637  TCP_EVT (TCP_EVT_ACK_RCVD, tc);
1638 
1639  /*
1640  * Check if we have congestion event
1641  */
1642 
1643  if (tcp_ack_is_cc_event (tc, b, prev_snd_wnd, prev_snd_una, &is_dack))
1644  {
1645  tcp_cc_handle_event (tc, &rs, is_dack);
1646  tc->dupacks_in += is_dack;
1647  if (!tcp_in_cong_recovery (tc))
1648  {
1649  *error = TCP_ERROR_ACK_OK;
1650  return 0;
1651  }
1652  *error = TCP_ERROR_ACK_DUP;
1653  if (vnet_buffer (b)->tcp.data_len || tcp_is_fin (th))
1654  return 0;
1655  return -1;
1656  }
1657 
1658  /*
1659  * Update congestion control (slow start/congestion avoidance)
1660  */
1661  tcp_cc_update (tc, &rs);
1662  *error = TCP_ERROR_ACK_OK;
1663  return 0;
1664 }
1665 
1666 static void
1668 {
1669  if (!tcp_disconnect_pending (tc))
1670  {
1671  vec_add1 (wrk->pending_disconnects, tc->c_c_index);
1673  }
1674 }
1675 
1676 static void
1678 {
1679  u32 thread_index, *pending_disconnects;
1680  tcp_connection_t *tc;
1681  int i;
1682 
1683  if (!vec_len (wrk->pending_disconnects))
1684  return;
1685 
1686  thread_index = wrk->vm->thread_index;
1687  pending_disconnects = wrk->pending_disconnects;
1688  for (i = 0; i < vec_len (pending_disconnects); i++)
1689  {
1690  tc = tcp_connection_get (pending_disconnects[i], thread_index);
1692  session_transport_closing_notify (&tc->connection);
1693  }
1694  _vec_len (wrk->pending_disconnects) = 0;
1695 }
1696 
1697 static void
1699  u32 * error)
1700 {
1701  /* Reject out-of-order fins */
1702  if (vnet_buffer (b)->tcp.seq_end != tc->rcv_nxt)
1703  return;
1704 
1705  /* Account for the FIN and send ack */
1706  tc->rcv_nxt += 1;
1707  tc->flags |= TCP_CONN_FINRCVD;
1708  tcp_program_ack (tc);
1709  /* Enter CLOSE-WAIT and notify session. To avoid lingering
1710  * in CLOSE-WAIT, set timer (reuse WAITCLOSE). */
1711  tcp_connection_set_state (tc, TCP_STATE_CLOSE_WAIT);
1712  tcp_program_disconnect (wrk, tc);
1713  tcp_timer_update (tc, TCP_TIMER_WAITCLOSE, tcp_cfg.closewait_time);
1714  TCP_EVT (TCP_EVT_FIN_RCVD, tc);
1715  *error = TCP_ERROR_FIN_RCVD;
1716 }
1717 
1718 #ifndef CLIB_MARCH_VARIANT
1719 static u8
1721 {
1722  int i;
1723  for (i = 1; i < vec_len (sacks); i++)
1724  {
1725  if (sacks[i - 1].end == sacks[i].start)
1726  return 0;
1727  }
1728  return 1;
1729 }
1730 
1731 /**
1732  * Build SACK list as per RFC2018.
1733  *
1734  * Makes sure the first block contains the segment that generated the current
1735  * ACK and the following ones are the ones most recently reported in SACK
1736  * blocks.
1737  *
1738  * @param tc TCP connection for which the SACK list is updated
1739  * @param start Start sequence number of the newest SACK block
1740  * @param end End sequence of the newest SACK block
1741  */
1742 void
1744 {
1745  sack_block_t *new_list = tc->snd_sacks_fl, *block = 0;
1746  int i;
1747 
1748  /* If the first segment is ooo add it to the list. Last write might've moved
1749  * rcv_nxt over the first segment. */
1750  if (seq_lt (tc->rcv_nxt, start))
1751  {
1752  vec_add2 (new_list, block, 1);
1753  block->start = start;
1754  block->end = end;
1755  }
1756 
1757  /* Find the blocks still worth keeping. */
1758  for (i = 0; i < vec_len (tc->snd_sacks); i++)
1759  {
1760  /* Discard if rcv_nxt advanced beyond current block */
1761  if (seq_leq (tc->snd_sacks[i].start, tc->rcv_nxt))
1762  continue;
1763 
1764  /* Merge or drop if segment overlapped by the new segment */
1765  if (block && (seq_geq (tc->snd_sacks[i].end, new_list[0].start)
1766  && seq_leq (tc->snd_sacks[i].start, new_list[0].end)))
1767  {
1768  if (seq_lt (tc->snd_sacks[i].start, new_list[0].start))
1769  new_list[0].start = tc->snd_sacks[i].start;
1770  if (seq_lt (new_list[0].end, tc->snd_sacks[i].end))
1771  new_list[0].end = tc->snd_sacks[i].end;
1772  continue;
1773  }
1774 
1775  /* Save to new SACK list if we have space. */
1776  if (vec_len (new_list) < TCP_MAX_SACK_BLOCKS)
1777  vec_add1 (new_list, tc->snd_sacks[i]);
1778  }
1779 
1780  ASSERT (vec_len (new_list) <= TCP_MAX_SACK_BLOCKS);
1781 
1782  /* Replace old vector with new one */
1783  vec_reset_length (tc->snd_sacks);
1784  tc->snd_sacks_fl = tc->snd_sacks;
1785  tc->snd_sacks = new_list;
1786 
1787  /* Segments should not 'touch' */
1788  ASSERT (tcp_sack_vector_is_sane (tc->snd_sacks));
1789 }
1790 
1791 u32
1793 {
1794  u32 bytes = 0, i;
1795  for (i = 0; i < vec_len (tc->snd_sacks); i++)
1796  bytes += tc->snd_sacks[i].end - tc->snd_sacks[i].start;
1797  return bytes;
1798 }
1799 #endif /* CLIB_MARCH_VARIANT */
1800 
1801 /** Enqueue data for delivery to application */
1802 static int
1804  u16 data_len)
1805 {
1806  int written, error = TCP_ERROR_ENQUEUED;
1807 
1808  ASSERT (seq_geq (vnet_buffer (b)->tcp.seq_number, tc->rcv_nxt));
1809  ASSERT (data_len);
1810  written = session_enqueue_stream_connection (&tc->connection, b, 0,
1811  1 /* queue event */ , 1);
1812  tc->bytes_in += written;
1813 
1814  TCP_EVT (TCP_EVT_INPUT, tc, 0, data_len, written);
1815 
1816  /* Update rcv_nxt */
1817  if (PREDICT_TRUE (written == data_len))
1818  {
1819  tc->rcv_nxt += written;
1820  }
1821  /* If more data written than expected, account for out-of-order bytes. */
1822  else if (written > data_len)
1823  {
1824  tc->rcv_nxt += written;
1825  TCP_EVT (TCP_EVT_CC_INPUT, tc, data_len, written);
1826  }
1827  else if (written > 0)
1828  {
1829  /* We've written something but FIFO is probably full now */
1830  tc->rcv_nxt += written;
1831  error = TCP_ERROR_PARTIALLY_ENQUEUED;
1832  }
1833  else
1834  {
1835  return TCP_ERROR_FIFO_FULL;
1836  }
1837 
1838  /* Update SACK list if need be */
1839  if (tcp_opts_sack_permitted (&tc->rcv_opts))
1840  {
1841  /* Remove SACK blocks that have been delivered */
1842  tcp_update_sack_list (tc, tc->rcv_nxt, tc->rcv_nxt);
1843  }
1844 
1845  return error;
1846 }
1847 
1848 /** Enqueue out-of-order data */
1849 static int
1851  u16 data_len)
1852 {
1853  session_t *s0;
1854  int rv, offset;
1855 
1856  ASSERT (seq_gt (vnet_buffer (b)->tcp.seq_number, tc->rcv_nxt));
1857  ASSERT (data_len);
1858 
1859  /* Enqueue out-of-order data with relative offset */
1860  rv = session_enqueue_stream_connection (&tc->connection, b,
1861  vnet_buffer (b)->tcp.seq_number -
1862  tc->rcv_nxt, 0 /* queue event */ ,
1863  0);
1864 
1865  /* Nothing written */
1866  if (rv)
1867  {
1868  TCP_EVT (TCP_EVT_INPUT, tc, 1, data_len, 0);
1869  return TCP_ERROR_FIFO_FULL;
1870  }
1871 
1872  TCP_EVT (TCP_EVT_INPUT, tc, 1, data_len, data_len);
1873  tc->bytes_in += data_len;
1874 
1875  /* Update SACK list if in use */
1876  if (tcp_opts_sack_permitted (&tc->rcv_opts))
1877  {
1878  ooo_segment_t *newest;
1879  u32 start, end;
1880 
1881  s0 = session_get (tc->c_s_index, tc->c_thread_index);
1882 
1883  /* Get the newest segment from the fifo */
1884  newest = svm_fifo_newest_ooo_segment (s0->rx_fifo);
1885  if (newest)
1886  {
1887  offset = ooo_segment_offset_prod (s0->rx_fifo, newest);
1888  ASSERT (offset <= vnet_buffer (b)->tcp.seq_number - tc->rcv_nxt);
1889  start = tc->rcv_nxt + offset;
1890  end = start + ooo_segment_length (s0->rx_fifo, newest);
1891  tcp_update_sack_list (tc, start, end);
1893  TCP_EVT (TCP_EVT_CC_SACKS, tc);
1894  }
1895  }
1896 
1897  return TCP_ERROR_ENQUEUED_OOO;
1898 }
1899 
1900 /**
1901  * Check if ACK could be delayed. If ack can be delayed, it should return
1902  * true for a full frame. If we're always acking return 0.
1903  */
1904 always_inline int
1906 {
1907  /* Send ack if ... */
1908  if (TCP_ALWAYS_ACK
1909  /* just sent a rcv wnd 0
1910  || (tc->flags & TCP_CONN_SENT_RCV_WND0) != 0 */
1911  /* constrained to send ack */
1912  || (tc->flags & TCP_CONN_SNDACK) != 0
1913  /* we're almost out of tx wnd */
1914  || tcp_available_cc_snd_space (tc) < 4 * tc->snd_mss)
1915  return 0;
1916 
1917  return 1;
1918 }
1919 
1920 static int
1922 {
1923  u32 discard, first = b->current_length;
1924  vlib_main_t *vm = vlib_get_main ();
1925 
1926  /* Handle multi-buffer segments */
1927  if (n_bytes_to_drop > b->current_length)
1928  {
1929  if (!(b->flags & VLIB_BUFFER_NEXT_PRESENT))
1930  return -1;
1931  do
1932  {
1933  discard = clib_min (n_bytes_to_drop, b->current_length);
1934  vlib_buffer_advance (b, discard);
1935  b = vlib_get_buffer (vm, b->next_buffer);
1936  n_bytes_to_drop -= discard;
1937  }
1938  while (n_bytes_to_drop);
1939  if (n_bytes_to_drop > first)
1940  b->total_length_not_including_first_buffer -= n_bytes_to_drop - first;
1941  }
1942  else
1943  vlib_buffer_advance (b, n_bytes_to_drop);
1944  vnet_buffer (b)->tcp.data_len -= n_bytes_to_drop;
1945  return 0;
1946 }
1947 
1948 /**
1949  * Receive buffer for connection and handle acks
1950  *
1951  * It handles both in order or out-of-order data.
1952  */
1953 static int
1955  vlib_buffer_t * b)
1956 {
1957  u32 error, n_bytes_to_drop, n_data_bytes;
1958 
1959  vlib_buffer_advance (b, vnet_buffer (b)->tcp.data_offset);
1960  n_data_bytes = vnet_buffer (b)->tcp.data_len;
1961  ASSERT (n_data_bytes);
1962  tc->data_segs_in += 1;
1963 
1964  /* Handle out-of-order data */
1965  if (PREDICT_FALSE (vnet_buffer (b)->tcp.seq_number != tc->rcv_nxt))
1966  {
1967  /* Old sequence numbers allowed through because they overlapped
1968  * the rx window */
1969  if (seq_lt (vnet_buffer (b)->tcp.seq_number, tc->rcv_nxt))
1970  {
1971  /* Completely in the past (possible retransmit). Ack
1972  * retransmissions since we may not have any data to send */
1973  if (seq_leq (vnet_buffer (b)->tcp.seq_end, tc->rcv_nxt))
1974  {
1975  tcp_program_ack (tc);
1976  error = TCP_ERROR_SEGMENT_OLD;
1977  goto done;
1978  }
1979 
1980  /* Chop off the bytes in the past and see if what is left
1981  * can be enqueued in order */
1982  n_bytes_to_drop = tc->rcv_nxt - vnet_buffer (b)->tcp.seq_number;
1983  n_data_bytes -= n_bytes_to_drop;
1984  vnet_buffer (b)->tcp.seq_number = tc->rcv_nxt;
1985  if (tcp_buffer_discard_bytes (b, n_bytes_to_drop))
1986  {
1987  error = TCP_ERROR_SEGMENT_OLD;
1988  goto done;
1989  }
1990  goto in_order;
1991  }
1992 
1993  /* RFC2581: Enqueue and send DUPACK for fast retransmit */
1994  error = tcp_session_enqueue_ooo (tc, b, n_data_bytes);
1995  tcp_program_dupack (tc);
1996  TCP_EVT (TCP_EVT_DUPACK_SENT, tc, vnet_buffer (b)->tcp);
1997  tc->errors.above_data_wnd += seq_gt (vnet_buffer (b)->tcp.seq_end,
1998  tc->rcv_las + tc->rcv_wnd);
1999  goto done;
2000  }
2001 
2002 in_order:
2003 
2004  /* In order data, enqueue. Fifo figures out by itself if any out-of-order
2005  * segments can be enqueued after fifo tail offset changes. */
2006  error = tcp_session_enqueue_data (tc, b, n_data_bytes);
2007  if (tcp_can_delack (tc))
2008  {
2009  if (!tcp_timer_is_active (tc, TCP_TIMER_DELACK))
2010  tcp_timer_set (tc, TCP_TIMER_DELACK, tcp_cfg.delack_time);
2011  goto done;
2012  }
2013 
2014  tcp_program_ack (tc);
2015 
2016 done:
2017  return error;
2018 }
2019 
2020 typedef struct
2021 {
2024 } tcp_rx_trace_t;
2025 
2026 static u8 *
2027 format_tcp_rx_trace (u8 * s, va_list * args)
2028 {
2029  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
2030  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
2031  tcp_rx_trace_t *t = va_arg (*args, tcp_rx_trace_t *);
2032  tcp_connection_t *tc = &t->tcp_connection;
2033  u32 indent = format_get_indent (s);
2034 
2035  s = format (s, "%U state %U\n%U%U", format_tcp_connection_id, tc,
2036  format_tcp_state, tc->state, format_white_space, indent,
2037  format_tcp_header, &t->tcp_header, 128);
2038 
2039  return s;
2040 }
2041 
2042 static u8 *
2043 format_tcp_rx_trace_short (u8 * s, va_list * args)
2044 {
2045  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
2046  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
2047  tcp_rx_trace_t *t = va_arg (*args, tcp_rx_trace_t *);
2048 
2049  s = format (s, "%d -> %d (%U)",
2050  clib_net_to_host_u16 (t->tcp_header.dst_port),
2051  clib_net_to_host_u16 (t->tcp_header.src_port), format_tcp_state,
2052  t->tcp_connection.state);
2053 
2054  return s;
2055 }
2056 
2057 static void
2059  tcp_header_t * th0, vlib_buffer_t * b0, u8 is_ip4)
2060 {
2061  if (tc0)
2062  {
2063  clib_memcpy_fast (&t0->tcp_connection, tc0,
2064  sizeof (t0->tcp_connection));
2065  }
2066  else
2067  {
2068  th0 = tcp_buffer_hdr (b0);
2069  }
2070  clib_memcpy_fast (&t0->tcp_header, th0, sizeof (t0->tcp_header));
2071 }
2072 
2073 static void
2076 {
2077  u32 *from, n_left;
2078 
2079  n_left = frame->n_vectors;
2080  from = vlib_frame_vector_args (frame);
2081 
2082  while (n_left >= 1)
2083  {
2084  tcp_connection_t *tc0;
2085  tcp_rx_trace_t *t0;
2086  tcp_header_t *th0;
2087  vlib_buffer_t *b0;
2088  u32 bi0;
2089 
2090  bi0 = from[0];
2091  b0 = vlib_get_buffer (vm, bi0);
2092 
2093  if (b0->flags & VLIB_BUFFER_IS_TRACED)
2094  {
2095  t0 = vlib_add_trace (vm, node, b0, sizeof (*t0));
2096  tc0 = tcp_connection_get (vnet_buffer (b0)->tcp.connection_index,
2097  vm->thread_index);
2098  th0 = tcp_buffer_hdr (b0);
2099  tcp_set_rx_trace_data (t0, tc0, th0, b0, is_ip4);
2100  }
2101 
2102  from += 1;
2103  n_left -= 1;
2104  }
2105 }
2106 
2107 always_inline void
2108 tcp_node_inc_counter_i (vlib_main_t * vm, u32 tcp4_node, u32 tcp6_node,
2109  u8 is_ip4, u32 evt, u32 val)
2110 {
2111  if (is_ip4)
2112  vlib_node_increment_counter (vm, tcp4_node, evt, val);
2113  else
2114  vlib_node_increment_counter (vm, tcp6_node, evt, val);
2115 }
2116 
/* Increment the per-node counter for 'err' unless the buffer is headed for
 * drop; note that 'count' is accepted but unused (always increments by 1).
 * Relies on 'next0' and 'is_ip4' being in scope at the expansion site. */
#define tcp_maybe_inc_counter(node_id, err, count) \
{ \
 if (next0 != tcp_next_drop (is_ip4)) \
 tcp_node_inc_counter_i (vm, tcp4_##node_id##_node.index, \
 tcp6_##node_id##_node.index, is_ip4, err, \
 1); \
}
/* Unconditionally add 'count' to the per-node counter for 'err'. */
#define tcp_inc_counter(node_id, err, count) \
 tcp_node_inc_counter_i (vm, tcp4_##node_id##_node.index, \
 tcp6_##node_id##_node.index, is_ip4, \
 err, count)
/* Bump a local error tally unless the buffer is being dropped. */
#define tcp_maybe_inc_err_counter(cnts, err) \
{ \
 cnts[err] += (next0 != tcp_next_drop (is_ip4)); \
}
/* Bump a local error tally by 'val'. */
#define tcp_inc_err_counter(cnts, err, val) \
{ \
 cnts[err] += val; \
}
/* Flush all non-zero local error tallies into the node counters. */
#define tcp_store_err_counters(node_id, cnts) \
{ \
 int i; \
 for (i = 0; i < TCP_N_ERROR; i++) \
 if (cnts[i]) \
 tcp_inc_counter(node_id, i, cnts[i]); \
}
2143 
2144 
2147  vlib_frame_t * frame, int is_ip4)
2148 {
2149  u32 thread_index = vm->thread_index, errors = 0;
2150  tcp_worker_ctx_t *wrk = tcp_get_worker (thread_index);
2151  u32 n_left_from, *from, *first_buffer;
2152  u16 err_counters[TCP_N_ERROR] = { 0 };
2153 
2154  if (node->flags & VLIB_NODE_FLAG_TRACE)
2155  tcp_established_trace_frame (vm, node, frame, is_ip4);
2156 
2157  first_buffer = from = vlib_frame_vector_args (frame);
2158  n_left_from = frame->n_vectors;
2159 
2160  while (n_left_from > 0)
2161  {
2162  u32 bi0, error0 = TCP_ERROR_ACK_OK;
2163  vlib_buffer_t *b0;
2164  tcp_header_t *th0;
2165  tcp_connection_t *tc0;
2166 
2167  if (n_left_from > 1)
2168  {
2169  vlib_buffer_t *pb;
2170  pb = vlib_get_buffer (vm, from[1]);
2171  vlib_prefetch_buffer_header (pb, LOAD);
2172  CLIB_PREFETCH (pb->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
2173  }
2174 
2175  bi0 = from[0];
2176  from += 1;
2177  n_left_from -= 1;
2178 
2179  b0 = vlib_get_buffer (vm, bi0);
2180  tc0 = tcp_connection_get (vnet_buffer (b0)->tcp.connection_index,
2181  thread_index);
2182 
2183  if (PREDICT_FALSE (tc0 == 0))
2184  {
2185  error0 = TCP_ERROR_INVALID_CONNECTION;
2186  goto done;
2187  }
2188 
2189  th0 = tcp_buffer_hdr (b0);
2190 
2191  /* TODO header prediction fast path */
2192 
2193  /* 1-4: check SEQ, RST, SYN */
2194  if (PREDICT_FALSE (tcp_segment_validate (wrk, tc0, b0, th0, &error0)))
2195  {
2196  TCP_EVT (TCP_EVT_SEG_INVALID, tc0, vnet_buffer (b0)->tcp);
2197  goto done;
2198  }
2199 
2200  /* 5: check the ACK field */
2201  if (PREDICT_FALSE (tcp_rcv_ack (wrk, tc0, b0, th0, &error0)))
2202  goto done;
2203 
2204  /* 6: check the URG bit TODO */
2205 
2206  /* 7: process the segment text */
2207  if (vnet_buffer (b0)->tcp.data_len)
2208  error0 = tcp_segment_rcv (wrk, tc0, b0);
2209 
2210  /* 8: check the FIN bit */
2211  if (PREDICT_FALSE (tcp_is_fin (th0)))
2212  tcp_rcv_fin (wrk, tc0, b0, &error0);
2213 
2214  done:
2215  tcp_inc_err_counter (err_counters, error0, 1);
2216  }
2217 
2218  errors = session_main_flush_enqueue_events (TRANSPORT_PROTO_TCP,
2219  thread_index);
2220  err_counters[TCP_ERROR_MSG_QUEUE_FULL] = errors;
2221  tcp_store_err_counters (established, err_counters);
2223  tcp_handle_disconnects (wrk);
2224  vlib_buffer_free (vm, first_buffer, frame->n_vectors);
2225 
2226  return frame->n_vectors;
2227 }
2228 
2231  vlib_frame_t * from_frame)
2232 {
2233  return tcp46_established_inline (vm, node, from_frame, 1 /* is_ip4 */ );
2234 }
2235 
2238  vlib_frame_t * from_frame)
2239 {
2240  return tcp46_established_inline (vm, node, from_frame, 0 /* is_ip4 */ );
2241 }
2242 
2243 /* *INDENT-OFF* */
2245 {
2246  .name = "tcp4-established",
2247  /* Takes a vector of packets. */
2248  .vector_size = sizeof (u32),
2249  .n_errors = TCP_N_ERROR,
2250  .error_strings = tcp_error_strings,
2251  .n_next_nodes = TCP_ESTABLISHED_N_NEXT,
2252  .next_nodes =
2253  {
2254 #define _(s,n) [TCP_ESTABLISHED_NEXT_##s] = n,
2256 #undef _
2257  },
2258  .format_trace = format_tcp_rx_trace_short,
2259 };
2260 /* *INDENT-ON* */
2261 
2262 /* *INDENT-OFF* */
2264 {
2265  .name = "tcp6-established",
2266  /* Takes a vector of packets. */
2267  .vector_size = sizeof (u32),
2268  .n_errors = TCP_N_ERROR,
2269  .error_strings = tcp_error_strings,
2270  .n_next_nodes = TCP_ESTABLISHED_N_NEXT,
2271  .next_nodes =
2272  {
2273 #define _(s,n) [TCP_ESTABLISHED_NEXT_##s] = n,
2275 #undef _
2276  },
2277  .format_trace = format_tcp_rx_trace_short,
2278 };
2279 /* *INDENT-ON* */
2280 
2281 
/* Check that buffer 'b' plausibly belongs to connection 'tc': compares
 * ports and ip addresses and, failing that, consults the half-open table. */
static u8
/* NOTE(review): declarator line lost in doc extraction; upstream signature
 * is tcp_lookup_is_valid (tcp_connection_t * tc, vlib_buffer_t * b,
 * tcp_header_t * hdr) -- confirm against repository source. */
		     tcp_header_t * hdr)
{
  transport_connection_t *tmp = 0;
  u64 handle;

  /* No connection to contradict */
  if (!tc)
    return 1;

  /* Proxy case */
  if (tc->c_lcl_port == 0 && tc->state == TCP_STATE_LISTEN)
    return 1;

  u8 is_ip_valid = 0, val_l, val_r;

  if (tc->connection.is_ip4)
    {
      /* NOTE(review): ip4 header pointer declaration (ip4_hdr) lost in doc
       * extraction -- confirm against repository source. */

      /* Local address must match (or be wildcard) */
      val_l = !ip4_address_compare (&ip4_hdr->dst_address,
				    &tc->connection.lcl_ip.ip4);
      val_l = val_l || ip_is_zero (&tc->connection.lcl_ip, 1);
      /* Remote address must match (listeners accept any remote) */
      val_r = !ip4_address_compare (&ip4_hdr->src_address,
				    &tc->connection.rmt_ip.ip4);
      val_r = val_r || tc->state == TCP_STATE_LISTEN;
      is_ip_valid = val_l && val_r;
    }
  else
    {
      /* NOTE(review): ip6 header pointer declaration (ip6_hdr) lost in doc
       * extraction -- confirm against repository source. */

      val_l = !ip6_address_compare (&ip6_hdr->dst_address,
				    &tc->connection.lcl_ip.ip6);
      val_l = val_l || ip_is_zero (&tc->connection.lcl_ip, 0);
      val_r = !ip6_address_compare (&ip6_hdr->src_address,
				    &tc->connection.rmt_ip.ip6);
      val_r = val_r || tc->state == TCP_STATE_LISTEN;
      is_ip_valid = val_l && val_r;
    }

  /* Ports must match as well; listeners accept any remote port */
  u8 is_valid = (tc->c_lcl_port == hdr->dst_port
		 && (tc->state == TCP_STATE_LISTEN
		     || tc->c_rmt_port == hdr->src_port) && is_ip_valid);

  if (!is_valid)
    {
      /* Mismatch: the segment may belong to a half-open connection */
      handle = session_lookup_half_open_handle (&tc->connection);
      tmp = session_lookup_half_open_connection (handle & 0xFFFFFFFF,
						 tc->c_proto, tc->c_is_ip4);

      if (tmp)
	{
	  if (tmp->lcl_port == hdr->dst_port
	      && tmp->rmt_port == hdr->src_port)
	    {
	      TCP_DBG ("half-open is valid!");
	      is_valid = 1;
	    }
	}
    }
  return is_valid;
}
2345 
2346 /**
2347  * Lookup transport connection
2348  */
2349 static tcp_connection_t *
2350 tcp_lookup_connection (u32 fib_index, vlib_buffer_t * b, u8 thread_index,
2351  u8 is_ip4)
2352 {
2353  tcp_header_t *tcp;
2354  transport_connection_t *tconn;
2355  tcp_connection_t *tc;
2356  u8 is_filtered = 0;
2357  if (is_ip4)
2358  {
2359  ip4_header_t *ip4;
2360  ip4 = vlib_buffer_get_current (b);
2361  tcp = ip4_next_header (ip4);
2362  tconn = session_lookup_connection_wt4 (fib_index,
2363  &ip4->dst_address,
2364  &ip4->src_address,
2365  tcp->dst_port,
2366  tcp->src_port,
2367  TRANSPORT_PROTO_TCP,
2368  thread_index, &is_filtered);
2369  tc = tcp_get_connection_from_transport (tconn);
2370  ASSERT (tcp_lookup_is_valid (tc, b, tcp));
2371  }
2372  else
2373  {
2374  ip6_header_t *ip6;
2375  ip6 = vlib_buffer_get_current (b);
2376  tcp = ip6_next_header (ip6);
2377  tconn = session_lookup_connection_wt6 (fib_index,
2378  &ip6->dst_address,
2379  &ip6->src_address,
2380  tcp->dst_port,
2381  tcp->src_port,
2382  TRANSPORT_PROTO_TCP,
2383  thread_index, &is_filtered);
2384  tc = tcp_get_connection_from_transport (tconn);
2385  ASSERT (tcp_lookup_is_valid (tc, b, tcp));
2386  }
2387  return tc;
2388 }
2389 
2390 static tcp_connection_t *
2392 {
2393  session_t *s;
2394 
2395  if (is_ip4)
2396  {
2398  tcp_header_t *tcp = tcp_buffer_hdr (b);
2399  s = session_lookup_listener4 (fib_index,
2400  &ip4->dst_address,
2401  tcp->dst_port, TRANSPORT_PROTO_TCP, 1);
2402  }
2403  else
2404  {
2406  tcp_header_t *tcp = tcp_buffer_hdr (b);
2407  s = session_lookup_listener6 (fib_index,
2408  &ip6->dst_address,
2409  tcp->dst_port, TRANSPORT_PROTO_TCP, 1);
2410 
2411  }
2412  if (PREDICT_TRUE (s != 0))
2414  (TRANSPORT_PROTO_TCP,
2415  s->connection_index));
2416  else
2417  return 0;
2418 }
2419 
2420 always_inline void
2422 {
2423  vnet_main_t *vnm = vnet_get_main ();
2424  const dpo_id_t *dpo;
2425  const load_balance_t *lb;
2426  vnet_hw_interface_t *hw_if;
2427  u32 sw_if_idx, lb_idx;
2428 
2429  if (is_ipv4)
2430  {
2431  ip4_address_t *dst_addr = &(tc->c_rmt_ip.ip4);
2432  lb_idx = ip4_fib_forwarding_lookup (tc->c_fib_index, dst_addr);
2433  }
2434  else
2435  {
2436  ip6_address_t *dst_addr = &(tc->c_rmt_ip.ip6);
2437  lb_idx = ip6_fib_table_fwding_lookup (tc->c_fib_index, dst_addr);
2438  }
2439 
2440  lb = load_balance_get (lb_idx);
2441  if (PREDICT_FALSE (lb->lb_n_buckets > 1))
2442  return;
2443  dpo = load_balance_get_bucket_i (lb, 0);
2444 
2445  sw_if_idx = dpo_get_urpf (dpo);
2446  if (PREDICT_FALSE (sw_if_idx == ~0))
2447  return;
2448 
2449  hw_if = vnet_get_sup_hw_interface (vnm, sw_if_idx);
2451  tc->cfg_flags |= TCP_CFG_F_TSO;
2452 }
2453 
2456  vlib_frame_t * from_frame, int is_ip4)
2457 {
2458  u32 n_left_from, *from, *first_buffer, errors = 0;
2459  u32 my_thread_index = vm->thread_index;
2460  tcp_worker_ctx_t *wrk = tcp_get_worker (my_thread_index);
2461 
2462  from = first_buffer = vlib_frame_vector_args (from_frame);
2463  n_left_from = from_frame->n_vectors;
2464 
2465  while (n_left_from > 0)
2466  {
2467  u32 bi0, ack0, seq0, error0 = TCP_ERROR_NONE;
2468  tcp_connection_t *tc0, *new_tc0;
2469  tcp_header_t *tcp0 = 0;
2470  tcp_rx_trace_t *t0;
2471  vlib_buffer_t *b0;
2472 
2473  bi0 = from[0];
2474  from += 1;
2475  n_left_from -= 1;
2476 
2477  b0 = vlib_get_buffer (vm, bi0);
2478  tc0 =
2479  tcp_half_open_connection_get (vnet_buffer (b0)->tcp.connection_index);
2480  if (PREDICT_FALSE (tc0 == 0))
2481  {
2482  error0 = TCP_ERROR_INVALID_CONNECTION;
2483  goto drop;
2484  }
2485 
2486  /* Half-open completed recently but the connection was't removed
2487  * yet by the owning thread */
2488  if (PREDICT_FALSE (tc0->flags & TCP_CONN_HALF_OPEN_DONE))
2489  {
2490  /* Make sure the connection actually exists */
2491  ASSERT (tcp_lookup_connection (tc0->c_fib_index, b0,
2492  my_thread_index, is_ip4));
2493  error0 = TCP_ERROR_SPURIOUS_SYN_ACK;
2494  goto drop;
2495  }
2496 
2497  ack0 = vnet_buffer (b0)->tcp.ack_number;
2498  seq0 = vnet_buffer (b0)->tcp.seq_number;
2499  tcp0 = tcp_buffer_hdr (b0);
2500 
2501  /* Crude check to see if the connection handle does not match
2502  * the packet. Probably connection just switched to established */
2503  if (PREDICT_FALSE (tcp0->dst_port != tc0->c_lcl_port
2504  || tcp0->src_port != tc0->c_rmt_port))
2505  {
2506  error0 = TCP_ERROR_INVALID_CONNECTION;
2507  goto drop;
2508  }
2509 
2510  if (PREDICT_FALSE (!tcp_ack (tcp0) && !tcp_rst (tcp0)
2511  && !tcp_syn (tcp0)))
2512  {
2513  error0 = TCP_ERROR_SEGMENT_INVALID;
2514  goto drop;
2515  }
2516 
2517  /* SYNs consume sequence numbers */
2518  vnet_buffer (b0)->tcp.seq_end += tcp_is_syn (tcp0);
2519 
2520  /*
2521  * 1. check the ACK bit
2522  */
2523 
2524  /*
2525  * If the ACK bit is set
2526  * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send a reset (unless
2527  * the RST bit is set, if so drop the segment and return)
2528  * <SEQ=SEG.ACK><CTL=RST>
2529  * and discard the segment. Return.
2530  * If SND.UNA =< SEG.ACK =< SND.NXT then the ACK is acceptable.
2531  */
2532  if (tcp_ack (tcp0))
2533  {
2534  if (seq_leq (ack0, tc0->iss) || seq_gt (ack0, tc0->snd_nxt))
2535  {
2536  if (!tcp_rst (tcp0))
2537  tcp_send_reset_w_pkt (tc0, b0, my_thread_index, is_ip4);
2538  error0 = TCP_ERROR_RCV_WND;
2539  goto drop;
2540  }
2541 
2542  /* Make sure ACK is valid */
2543  if (seq_gt (tc0->snd_una, ack0))
2544  {
2545  error0 = TCP_ERROR_ACK_INVALID;
2546  goto drop;
2547  }
2548  }
2549 
2550  /*
2551  * 2. check the RST bit
2552  */
2553 
2554  if (tcp_rst (tcp0))
2555  {
2556  /* If ACK is acceptable, signal client that peer is not
2557  * willing to accept connection and drop connection*/
2558  if (tcp_ack (tcp0))
2559  tcp_connection_reset (tc0);
2560  error0 = TCP_ERROR_RST_RCVD;
2561  goto drop;
2562  }
2563 
2564  /*
2565  * 3. check the security and precedence (skipped)
2566  */
2567 
2568  /*
2569  * 4. check the SYN bit
2570  */
2571 
2572  /* No SYN flag. Drop. */
2573  if (!tcp_syn (tcp0))
2574  {
2575  error0 = TCP_ERROR_SEGMENT_INVALID;
2576  goto drop;
2577  }
2578 
2579  /* Parse options */
2580  if (tcp_options_parse (tcp0, &tc0->rcv_opts, 1))
2581  {
2582  error0 = TCP_ERROR_OPTIONS;
2583  goto drop;
2584  }
2585 
2586  /* Valid SYN or SYN-ACK. Move connection from half-open pool to
2587  * current thread pool. */
2588  new_tc0 = tcp_connection_alloc_w_base (my_thread_index, tc0);
2589  new_tc0->rcv_nxt = vnet_buffer (b0)->tcp.seq_end;
2590  new_tc0->irs = seq0;
2591  new_tc0->timers[TCP_TIMER_RETRANSMIT_SYN] = TCP_TIMER_HANDLE_INVALID;
2592  new_tc0->sw_if_index = vnet_buffer (b0)->sw_if_index[VLIB_RX];
2593 
2594  /* If this is not the owning thread, wait for syn retransmit to
2595  * expire and cleanup then */
2597  tc0->flags |= TCP_CONN_HALF_OPEN_DONE;
2598 
2599  if (tcp_opts_tstamp (&new_tc0->rcv_opts))
2600  {
2601  new_tc0->tsval_recent = new_tc0->rcv_opts.tsval;
2602  new_tc0->tsval_recent_age = tcp_time_now ();
2603  }
2604 
2605  if (tcp_opts_wscale (&new_tc0->rcv_opts))
2606  new_tc0->snd_wscale = new_tc0->rcv_opts.wscale;
2607  else
2608  new_tc0->rcv_wscale = 0;
2609 
2610  new_tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window)
2611  << new_tc0->snd_wscale;
2612  new_tc0->snd_wl1 = seq0;
2613  new_tc0->snd_wl2 = ack0;
2614 
2615  tcp_connection_init_vars (new_tc0);
2616 
2617  /* SYN-ACK: See if we can switch to ESTABLISHED state */
2618  if (PREDICT_TRUE (tcp_ack (tcp0)))
2619  {
2620  /* Our SYN is ACKed: we have iss < ack = snd_una */
2621 
2622  /* TODO Dequeue acknowledged segments if we support Fast Open */
2623  new_tc0->snd_una = ack0;
2624  new_tc0->state = TCP_STATE_ESTABLISHED;
2625 
2626  /* Make sure las is initialized for the wnd computation */
2627  new_tc0->rcv_las = new_tc0->rcv_nxt;
2628 
2629  /* Notify app that we have connection. If session layer can't
2630  * allocate session send reset */
2631  if (session_stream_connect_notify (&new_tc0->connection, 0))
2632  {
2633  tcp_send_reset_w_pkt (new_tc0, b0, my_thread_index, is_ip4);
2634  tcp_connection_cleanup (new_tc0);
2635  error0 = TCP_ERROR_CREATE_SESSION_FAIL;
2636  goto drop;
2637  }
2638 
2639  new_tc0->tx_fifo_size =
2640  transport_tx_fifo_size (&new_tc0->connection);
2641  /* Update rtt with the syn-ack sample */
2642  tcp_estimate_initial_rtt (new_tc0);
2643  TCP_EVT (TCP_EVT_SYNACK_RCVD, new_tc0);
2644  error0 = TCP_ERROR_SYN_ACKS_RCVD;
2645  }
2646  /* SYN: Simultaneous open. Change state to SYN-RCVD and send SYN-ACK */
2647  else
2648  {
2649  new_tc0->state = TCP_STATE_SYN_RCVD;
2650 
2651  /* Notify app that we have connection */
2652  if (session_stream_connect_notify (&new_tc0->connection, 0))
2653  {
2654  tcp_connection_cleanup (new_tc0);
2655  tcp_send_reset_w_pkt (tc0, b0, my_thread_index, is_ip4);
2656  TCP_EVT (TCP_EVT_RST_SENT, tc0);
2657  error0 = TCP_ERROR_CREATE_SESSION_FAIL;
2658  goto drop;
2659  }
2660 
2661  new_tc0->tx_fifo_size =
2662  transport_tx_fifo_size (&new_tc0->connection);
2663  new_tc0->rtt_ts = 0;
2664  tcp_init_snd_vars (new_tc0);
2665  tcp_send_synack (new_tc0);
2666  error0 = TCP_ERROR_SYNS_RCVD;
2667  goto drop;
2668  }
2669 
2670  if (!(new_tc0->cfg_flags & TCP_CFG_F_NO_TSO))
2671  tcp_check_tx_offload (new_tc0, is_ip4);
2672 
2673  /* Read data, if any */
2674  if (PREDICT_FALSE (vnet_buffer (b0)->tcp.data_len))
2675  {
2676  clib_warning ("rcvd data in syn-sent");
2677  error0 = tcp_segment_rcv (wrk, new_tc0, b0);
2678  if (error0 == TCP_ERROR_ACK_OK)
2679  error0 = TCP_ERROR_SYN_ACKS_RCVD;
2680  }
2681  else
2682  {
2683  /* Send ack now instead of programming it because connection was
2684  * just established and it's not optional. */
2685  tcp_send_ack (new_tc0);
2686  }
2687 
2688  drop:
2689 
2690  tcp_inc_counter (syn_sent, error0, 1);
2691  if (PREDICT_FALSE ((b0->flags & VLIB_BUFFER_IS_TRACED) && tcp0 != 0))
2692  {
2693  t0 = vlib_add_trace (vm, node, b0, sizeof (*t0));
2694  clib_memcpy_fast (&t0->tcp_header, tcp0, sizeof (t0->tcp_header));
2695  clib_memcpy_fast (&t0->tcp_connection, tc0,
2696  sizeof (t0->tcp_connection));
2697  }
2698  }
2699 
2700  errors = session_main_flush_enqueue_events (TRANSPORT_PROTO_TCP,
2701  my_thread_index);
2702  tcp_inc_counter (syn_sent, TCP_ERROR_MSG_QUEUE_FULL, errors);
2703  vlib_buffer_free (vm, first_buffer, from_frame->n_vectors);
2704 
2705  return from_frame->n_vectors;
2706 }
2707 
2710  vlib_frame_t * from_frame)
2711 {
2712  return tcp46_syn_sent_inline (vm, node, from_frame, 1 /* is_ip4 */ );
2713 }
2714 
2717  vlib_frame_t * from_frame)
2718 {
2719  return tcp46_syn_sent_inline (vm, node, from_frame, 0 /* is_ip4 */ );
2720 }
2721 
2722 /* *INDENT-OFF* */
2724 {
2725  .name = "tcp4-syn-sent",
2726  /* Takes a vector of packets. */
2727  .vector_size = sizeof (u32),
2728  .n_errors = TCP_N_ERROR,
2729  .error_strings = tcp_error_strings,
2730  .n_next_nodes = TCP_SYN_SENT_N_NEXT,
2731  .next_nodes =
2732  {
2733 #define _(s,n) [TCP_SYN_SENT_NEXT_##s] = n,
2735 #undef _
2736  },
2737  .format_trace = format_tcp_rx_trace_short,
2738 };
2739 /* *INDENT-ON* */
2740 
2741 /* *INDENT-OFF* */
2743 {
2744  .name = "tcp6-syn-sent",
2745  /* Takes a vector of packets. */
2746  .vector_size = sizeof (u32),
2747  .n_errors = TCP_N_ERROR,
2748  .error_strings = tcp_error_strings,
2749  .n_next_nodes = TCP_SYN_SENT_N_NEXT,
2750  .next_nodes =
2751  {
2752 #define _(s,n) [TCP_SYN_SENT_NEXT_##s] = n,
2754 #undef _
2755  },
2756  .format_trace = format_tcp_rx_trace_short,
2757 };
2758 /* *INDENT-ON* */
2759 
2760 /**
2761  * Handles reception for all states except LISTEN, SYN-SENT and ESTABLISHED
2762  * as per RFC793 p. 64
2763  */
2766  vlib_frame_t * from_frame, int is_ip4)
2767 {
2768  u32 thread_index = vm->thread_index, errors = 0, *first_buffer;
2769  tcp_worker_ctx_t *wrk = tcp_get_worker (thread_index);
2770  u32 n_left_from, *from, max_dequeue;
2771 
2772  from = first_buffer = vlib_frame_vector_args (from_frame);
2773  n_left_from = from_frame->n_vectors;
2774 
2775  while (n_left_from > 0)
2776  {
2777  u32 bi0, error0 = TCP_ERROR_NONE;
2778  tcp_header_t *tcp0 = 0;
2779  tcp_connection_t *tc0;
2780  vlib_buffer_t *b0;
2781  u8 is_fin0;
2782 
2783  bi0 = from[0];
2784  from += 1;
2785  n_left_from -= 1;
2786 
2787  b0 = vlib_get_buffer (vm, bi0);
2788  tc0 = tcp_connection_get (vnet_buffer (b0)->tcp.connection_index,
2789  thread_index);
2790  if (PREDICT_FALSE (tc0 == 0))
2791  {
2792  error0 = TCP_ERROR_INVALID_CONNECTION;
2793  goto drop;
2794  }
2795 
2796  tcp0 = tcp_buffer_hdr (b0);
2797  is_fin0 = tcp_is_fin (tcp0);
2798 
2799  if (CLIB_DEBUG)
2800  {
2801  if (!(tc0->connection.flags & TRANSPORT_CONNECTION_F_NO_LOOKUP))
2802  {
2803  tcp_connection_t *tmp;
2804  tmp = tcp_lookup_connection (tc0->c_fib_index, b0, thread_index,
2805  is_ip4);
2806  if (tmp->state != tc0->state)
2807  {
2808  if (tc0->state != TCP_STATE_CLOSED)
2809  clib_warning ("state changed");
2810  goto drop;
2811  }
2812  }
2813  }
2814 
2815  /*
2816  * Special treatment for CLOSED
2817  */
2818  if (PREDICT_FALSE (tc0->state == TCP_STATE_CLOSED))
2819  {
2820  error0 = TCP_ERROR_CONNECTION_CLOSED;
2821  goto drop;
2822  }
2823 
2824  /*
2825  * For all other states (except LISTEN)
2826  */
2827 
2828  /* 1-4: check SEQ, RST, SYN */
2829  if (PREDICT_FALSE (tcp_segment_validate (wrk, tc0, b0, tcp0, &error0)))
2830  goto drop;
2831 
2832  /* 5: check the ACK field */
2833  switch (tc0->state)
2834  {
2835  case TCP_STATE_SYN_RCVD:
2836 
2837  /* Make sure the segment is exactly right */
2838  if (tc0->rcv_nxt != vnet_buffer (b0)->tcp.seq_number || is_fin0)
2839  {
2840  tcp_connection_reset (tc0);
2841  error0 = TCP_ERROR_SEGMENT_INVALID;
2842  goto drop;
2843  }
2844 
2845  /*
2846  * If the segment acknowledgment is not acceptable, form a
2847  * reset segment,
2848  * <SEQ=SEG.ACK><CTL=RST>
2849  * and send it.
2850  */
2851  if (tcp_rcv_ack_no_cc (tc0, b0, &error0))
2852  {
2853  tcp_connection_reset (tc0);
2854  goto drop;
2855  }
2856 
2857  /* Update rtt and rto */
2860 
2861  /* Switch state to ESTABLISHED */
2862  tc0->state = TCP_STATE_ESTABLISHED;
2863  TCP_EVT (TCP_EVT_STATE_CHANGE, tc0);
2864 
2865  if (!(tc0->cfg_flags & TCP_CFG_F_NO_TSO))
2866  tcp_check_tx_offload (tc0, is_ip4);
2867 
2868  /* Initialize session variables */
2869  tc0->snd_una = vnet_buffer (b0)->tcp.ack_number;
2870  tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window)
2871  << tc0->rcv_opts.wscale;
2872  tc0->snd_wl1 = vnet_buffer (b0)->tcp.seq_number;
2873  tc0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number;
2874 
2875  /* Reset SYN-ACK retransmit and SYN_RCV establish timers */
2877  if (session_stream_accept_notify (&tc0->connection))
2878  {
2879  error0 = TCP_ERROR_MSG_QUEUE_FULL;
2880  tcp_connection_reset (tc0);
2881  goto drop;
2882  }
2883  error0 = TCP_ERROR_ACK_OK;
2884  break;
2885  case TCP_STATE_ESTABLISHED:
2886  /* We can get packets in established state here because they
2887  * were enqueued before state change */
2888  if (tcp_rcv_ack (wrk, tc0, b0, tcp0, &error0))
2889  goto drop;
2890 
2891  break;
2892  case TCP_STATE_FIN_WAIT_1:
2893  /* In addition to the processing for the ESTABLISHED state, if
2894  * our FIN is now acknowledged then enter FIN-WAIT-2 and
2895  * continue processing in that state. */
2896  if (tcp_rcv_ack (wrk, tc0, b0, tcp0, &error0))
2897  goto drop;
2898 
2899  /* Still have to send the FIN */
2900  if (tc0->flags & TCP_CONN_FINPNDG)
2901  {
2902  /* TX fifo finally drained */
2903  max_dequeue = transport_max_tx_dequeue (&tc0->connection);
2904  if (max_dequeue <= tc0->burst_acked)
2905  tcp_send_fin (tc0);
2906  /* If a fin was received and data was acked extend wait */
2907  else if ((tc0->flags & TCP_CONN_FINRCVD) && tc0->bytes_acked)
2908  tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE,
2909  tcp_cfg.closewait_time);
2910  }
2911  /* If FIN is ACKed */
2912  else if (tc0->snd_una == tc0->snd_nxt)
2913  {
2914  /* Stop all retransmit timers because we have nothing more
2915  * to send. */
2917 
2918  /* We already have a FIN but didn't transition to CLOSING
2919  * because of outstanding tx data. Close the connection. */
2920  if (tc0->flags & TCP_CONN_FINRCVD)
2921  {
2922  tcp_connection_set_state (tc0, TCP_STATE_CLOSED);
2923  tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE,
2924  tcp_cfg.cleanup_time);
2925  session_transport_closed_notify (&tc0->connection);
2926  goto drop;
2927  }
2928 
2929  tcp_connection_set_state (tc0, TCP_STATE_FIN_WAIT_2);
2930  /* Enable waitclose because we're willing to wait for peer's
2931  * FIN but not indefinitely. */
2932  tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, tcp_cfg.finwait2_time);
2933 
2934  /* Don't try to deq the FIN acked */
2935  if (tc0->burst_acked > 1)
2936  session_tx_fifo_dequeue_drop (&tc0->connection,
2937  tc0->burst_acked - 1);
2938  tc0->burst_acked = 0;
2939  }
2940  break;
2941  case TCP_STATE_FIN_WAIT_2:
2942  /* In addition to the processing for the ESTABLISHED state, if
2943  * the retransmission queue is empty, the user's CLOSE can be
2944  * acknowledged ("ok") but do not delete the TCB. */
2945  if (tcp_rcv_ack_no_cc (tc0, b0, &error0))
2946  goto drop;
2947  tc0->burst_acked = 0;
2948  break;
2949  case TCP_STATE_CLOSE_WAIT:
2950  /* Do the same processing as for the ESTABLISHED state. */
2951  if (tcp_rcv_ack (wrk, tc0, b0, tcp0, &error0))
2952  goto drop;
2953 
2954  if (!(tc0->flags & TCP_CONN_FINPNDG))
2955  break;
2956 
2957  /* Still have outstanding tx data */
2958  max_dequeue = transport_max_tx_dequeue (&tc0->connection);
2959  if (max_dequeue > tc0->burst_acked)
2960  break;
2961 
2962  tcp_send_fin (tc0);
2964  tcp_connection_set_state (tc0, TCP_STATE_LAST_ACK);
2965  tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, tcp_cfg.lastack_time);
2966  break;
2967  case TCP_STATE_CLOSING:
2968  /* In addition to the processing for the ESTABLISHED state, if
2969  * the ACK acknowledges our FIN then enter the TIME-WAIT state,
2970  * otherwise ignore the segment. */
2971  if (tcp_rcv_ack_no_cc (tc0, b0, &error0))
2972  goto drop;
2973 
2974  if (tc0->snd_una != tc0->snd_nxt)
2975  goto drop;
2976 
2978  tcp_connection_set_state (tc0, TCP_STATE_TIME_WAIT);
2979  tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, tcp_cfg.timewait_time);
2980  session_transport_closed_notify (&tc0->connection);
2981  goto drop;
2982 
2983  break;
2984  case TCP_STATE_LAST_ACK:
2985  /* The only thing that [should] arrive in this state is an
2986  * acknowledgment of our FIN. If our FIN is now acknowledged,
2987  * delete the TCB, enter the CLOSED state, and return. */
2988 
2989  if (tcp_rcv_ack_no_cc (tc0, b0, &error0))
2990  goto drop;
2991 
2992  /* Apparently our ACK for the peer's FIN was lost */
2993  if (is_fin0 && tc0->snd_una != tc0->snd_nxt)
2994  {
2995  tcp_send_fin (tc0);
2996  goto drop;
2997  }
2998 
2999  tcp_connection_set_state (tc0, TCP_STATE_CLOSED);
3000  session_transport_closed_notify (&tc0->connection);
3001 
3002  /* Don't free the connection from the data path since
3003  * we can't ensure that we have no packets already enqueued
3004  * to output. Rely instead on the waitclose timer */
3006  tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, tcp_cfg.cleanup_time);
3007 
3008  goto drop;
3009 
3010  break;
3011  case TCP_STATE_TIME_WAIT:
3012  /* The only thing that can arrive in this state is a
3013  * retransmission of the remote FIN. Acknowledge it, and restart
3014  * the 2 MSL timeout. */
3015 
3016  if (tcp_rcv_ack_no_cc (tc0, b0, &error0))
3017  goto drop;
3018 
3019  if (!is_fin0)
3020  goto drop;
3021 
3022  tcp_program_ack (tc0);
3023  tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, tcp_cfg.timewait_time);
3024  goto drop;
3025 
3026  break;
3027  default:
3028  ASSERT (0);
3029  }
3030 
3031  /* 6: check the URG bit TODO */
3032 
3033  /* 7: process the segment text */
3034  switch (tc0->state)
3035  {
3036  case TCP_STATE_ESTABLISHED:
3037  case TCP_STATE_FIN_WAIT_1:
3038  case TCP_STATE_FIN_WAIT_2:
3039  if (vnet_buffer (b0)->tcp.data_len)
3040  error0 = tcp_segment_rcv (wrk, tc0, b0);
3041  break;
3042  case TCP_STATE_CLOSE_WAIT:
3043  case TCP_STATE_CLOSING:
3044  case TCP_STATE_LAST_ACK:
3045  case TCP_STATE_TIME_WAIT:
3046  /* This should not occur, since a FIN has been received from the
3047  * remote side. Ignore the segment text. */
3048  break;
3049  }
3050 
3051  /* 8: check the FIN bit */
3052  if (!is_fin0)
3053  goto drop;
3054 
3055  TCP_EVT (TCP_EVT_FIN_RCVD, tc0);
3056 
3057  switch (tc0->state)
3058  {
3059  case TCP_STATE_ESTABLISHED:
3060  /* Account for the FIN and send ack */
3061  tc0->rcv_nxt += 1;
3062  tcp_program_ack (tc0);
3063  tcp_connection_set_state (tc0, TCP_STATE_CLOSE_WAIT);
3064  tcp_program_disconnect (wrk, tc0);
3065  tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, tcp_cfg.closewait_time);
3066  break;
3067  case TCP_STATE_SYN_RCVD:
3068  /* Send FIN-ACK, enter LAST-ACK and because the app was not
3069  * notified yet, set a cleanup timer instead of relying on
3070  * disconnect notify and the implicit close call. */
3072  tc0->rcv_nxt += 1;
3073  tcp_send_fin (tc0);
3074  tcp_connection_set_state (tc0, TCP_STATE_LAST_ACK);
3075  tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, tcp_cfg.lastack_time);
3076  break;
3077  case TCP_STATE_CLOSE_WAIT:
3078  case TCP_STATE_CLOSING:
3079  case TCP_STATE_LAST_ACK:
3080  /* move along .. */
3081  break;
3082  case TCP_STATE_FIN_WAIT_1:
3083  tc0->rcv_nxt += 1;
3084 
3085  if (tc0->flags & TCP_CONN_FINPNDG)
3086  {
3087  /* If data is outstanding, stay in FIN_WAIT_1 and try to finish
3088  * sending it. Since we already received a fin, do not wait
3089  * for too long. */
3090  tc0->flags |= TCP_CONN_FINRCVD;
3091  tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE,
3092  tcp_cfg.closewait_time);
3093  }
3094  else
3095  {
3096  tcp_connection_set_state (tc0, TCP_STATE_CLOSING);
3097  tcp_program_ack (tc0);
3098  /* Wait for ACK for our FIN but not forever */
3099  tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE,
3100  tcp_cfg.closing_time);
3101  }
3102  break;
3103  case TCP_STATE_FIN_WAIT_2:
3104  /* Got FIN, send ACK! Be more aggressive with resource cleanup */
3105  tc0->rcv_nxt += 1;
3106  tcp_connection_set_state (tc0, TCP_STATE_TIME_WAIT);
3108  tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, tcp_cfg.timewait_time);
3109  tcp_program_ack (tc0);
3110  session_transport_closed_notify (&tc0->connection);
3111  break;
3112  case TCP_STATE_TIME_WAIT:
3113  /* Remain in the TIME-WAIT state. Restart the time-wait
3114  * timeout.
3115  */
3116  tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, tcp_cfg.timewait_time);
3117  break;
3118  }
3119  error0 = TCP_ERROR_FIN_RCVD;
3120 
3121  drop:
3122 
3123  tcp_inc_counter (rcv_process, error0, 1);
3124  if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
3125  {
3126  tcp_rx_trace_t *t0 = vlib_add_trace (vm, node, b0, sizeof (*t0));
3127  tcp_set_rx_trace_data (t0, tc0, tcp0, b0, is_ip4);
3128  }
3129  }
3130 
3131  errors = session_main_flush_enqueue_events (TRANSPORT_PROTO_TCP,
3132  thread_index);
3133  tcp_inc_counter (rcv_process, TCP_ERROR_MSG_QUEUE_FULL, errors);
3135  tcp_handle_disconnects (wrk);
3136  vlib_buffer_free (vm, first_buffer, from_frame->n_vectors);
3137 
3138  return from_frame->n_vectors;
3139 }
3140 
3143  vlib_frame_t * from_frame)
3144 {
3145  return tcp46_rcv_process_inline (vm, node, from_frame, 1 /* is_ip4 */ );
3146 }
3147 
3150  vlib_frame_t * from_frame)
3151 {
3152  return tcp46_rcv_process_inline (vm, node, from_frame, 0 /* is_ip4 */ );
3153 }
3154 
3155 /* *INDENT-OFF* */
3157 {
3158  .name = "tcp4-rcv-process",
3159  /* Takes a vector of packets. */
3160  .vector_size = sizeof (u32),
3161  .n_errors = TCP_N_ERROR,
3162  .error_strings = tcp_error_strings,
3163  .n_next_nodes = TCP_RCV_PROCESS_N_NEXT,
3164  .next_nodes =
3165  {
3166 #define _(s,n) [TCP_RCV_PROCESS_NEXT_##s] = n,
3168 #undef _
3169  },
3170  .format_trace = format_tcp_rx_trace_short,
3171 };
3172 /* *INDENT-ON* */
3173 
3174 /* *INDENT-OFF* */
3176 {
3177  .name = "tcp6-rcv-process",
3178  /* Takes a vector of packets. */
3179  .vector_size = sizeof (u32),
3180  .n_errors = TCP_N_ERROR,
3181  .error_strings = tcp_error_strings,
3182  .n_next_nodes = TCP_RCV_PROCESS_N_NEXT,
3183  .next_nodes =
3184  {
3185 #define _(s,n) [TCP_RCV_PROCESS_NEXT_##s] = n,
3187 #undef _
3188  },
3189  .format_trace = format_tcp_rx_trace_short,
3190 };
3191 /* *INDENT-ON* */
3192 
3193 /**
3194  * LISTEN state processing as per RFC 793 p. 65
3195  */
3198  vlib_frame_t * from_frame, int is_ip4)
3199 {
3200  u32 n_left_from, *from, n_syns = 0, *first_buffer;
3201  u32 my_thread_index = vm->thread_index;
3202  tcp_connection_t *tc0;
3203 
3204  from = first_buffer = vlib_frame_vector_args (from_frame);
3205  n_left_from = from_frame->n_vectors;
3206 
3207  while (n_left_from > 0)
3208  {
3209  u32 bi0;
3210  vlib_buffer_t *b0;
3211  tcp_rx_trace_t *t0;
3212  tcp_header_t *th0 = 0;
3213  tcp_connection_t *lc0;
3214  ip4_header_t *ip40;
3215  ip6_header_t *ip60;
3216  tcp_connection_t *child0;
3217  u32 error0 = TCP_ERROR_NONE;
3218 
3219  bi0 = from[0];
3220  from += 1;
3221  n_left_from -= 1;
3222 
3223  b0 = vlib_get_buffer (vm, bi0);
3224  lc0 = tcp_listener_get (vnet_buffer (b0)->tcp.connection_index);
3225  if (PREDICT_FALSE (lc0 == 0))
3226  {
3227  tc0 = tcp_connection_get (vnet_buffer (b0)->tcp.connection_index,
3228  my_thread_index);
3229  if (tc0->state != TCP_STATE_TIME_WAIT)
3230  {
3231  error0 = TCP_ERROR_CREATE_EXISTS;
3232  goto drop;
3233  }
3234  lc0 = tcp_lookup_listener (b0, tc0->c_fib_index, is_ip4);
3235  /* clean up the old session */
3236  tcp_connection_del (tc0);
3237  }
3238 
3239  if (is_ip4)
3240  {
3241  ip40 = vlib_buffer_get_current (b0);
3242  th0 = tcp_buffer_hdr (b0);
3243  }
3244  else
3245  {
3246  ip60 = vlib_buffer_get_current (b0);
3247  th0 = tcp_buffer_hdr (b0);
3248  }
3249 
3250  /* Create child session. For syn-flood protection use filter */
3251 
3252  /* 1. first check for an RST: handled in dispatch */
3253  /* if (tcp_rst (th0))
3254  goto drop;
3255  */
3256 
3257  /* 2. second check for an ACK: handled in dispatch */
3258  /* if (tcp_ack (th0))
3259  {
3260  tcp_send_reset (b0, is_ip4);
3261  goto drop;
3262  }
3263  */
3264 
3265  /* 3. check for a SYN (did that already) */
3266 
3267  /* Make sure connection wasn't just created */
3268  child0 = tcp_lookup_connection (lc0->c_fib_index, b0, my_thread_index,
3269  is_ip4);
3270  if (PREDICT_FALSE (child0->state != TCP_STATE_LISTEN))
3271  {
3272  error0 = TCP_ERROR_CREATE_EXISTS;
3273  goto drop;
3274  }
3275 
3276  /* Create child session and send SYN-ACK */
3277  child0 = tcp_connection_alloc (my_thread_index);
3278  child0->c_lcl_port = th0->dst_port;
3279  child0->c_rmt_port = th0->src_port;
3280  child0->c_is_ip4 = is_ip4;
3281  child0->state = TCP_STATE_SYN_RCVD;
3282  child0->c_fib_index = lc0->c_fib_index;
3283  child0->cc_algo = lc0->cc_algo;
3284 
3285  if (is_ip4)
3286  {
3287  child0->c_lcl_ip4.as_u32 = ip40->dst_address.as_u32;
3288  child0->c_rmt_ip4.as_u32 = ip40->src_address.as_u32;
3289  }
3290  else
3291  {
3292  clib_memcpy_fast (&child0->c_lcl_ip6, &ip60->dst_address,
3293  sizeof (ip6_address_t));
3294  clib_memcpy_fast (&child0->c_rmt_ip6, &ip60->src_address,
3295  sizeof (ip6_address_t));
3296  }
3297 
3298  if (tcp_options_parse (th0, &child0->rcv_opts, 1))
3299  {
3300  error0 = TCP_ERROR_OPTIONS;
3301  tcp_connection_free (child0);
3302  goto drop;
3303  }
3304 
3305  child0->irs = vnet_buffer (b0)->tcp.seq_number;
3306  child0->rcv_nxt = vnet_buffer (b0)->tcp.seq_number + 1;
3307  child0->rcv_las = child0->rcv_nxt;
3308  child0->sw_if_index = vnet_buffer (b0)->sw_if_index[VLIB_RX];
3309 
3310  /* RFC1323: TSval timestamps sent on {SYN} and {SYN,ACK}
3311  * segments are used to initialize PAWS. */
3312  if (tcp_opts_tstamp (&child0->rcv_opts))
3313  {
3314  child0->tsval_recent = child0->rcv_opts.tsval;
3315  child0->tsval_recent_age = tcp_time_now ();
3316  }
3317 
3318  if (tcp_opts_wscale (&child0->rcv_opts))
3319  child0->snd_wscale = child0->rcv_opts.wscale;
3320 
3321  child0->snd_wnd = clib_net_to_host_u16 (th0->window)
3322  << child0->snd_wscale;
3323  child0->snd_wl1 = vnet_buffer (b0)->tcp.seq_number;
3324  child0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number;
3325 
3326  tcp_connection_init_vars (child0);
3327  child0->rto = TCP_RTO_MIN;
3328 
3329  if (session_stream_accept (&child0->connection, lc0->c_s_index,
3330  lc0->c_thread_index, 0 /* notify */ ))
3331  {
3332  tcp_connection_cleanup (child0);
3333  error0 = TCP_ERROR_CREATE_SESSION_FAIL;
3334  goto drop;
3335  }
3336 
3337  TCP_EVT (TCP_EVT_SYN_RCVD, child0, 1);
3338  child0->tx_fifo_size = transport_tx_fifo_size (&child0->connection);
3339  tcp_send_synack (child0);
3340 
3341  drop:
3342 
3343  if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
3344  {
3345  t0 = vlib_add_trace (vm, node, b0, sizeof (*t0));
3346  clib_memcpy_fast (&t0->tcp_header, th0, sizeof (t0->tcp_header));
3347  clib_memcpy_fast (&t0->tcp_connection, lc0,
3348  sizeof (t0->tcp_connection));
3349  }
3350 
3351  n_syns += (error0 == TCP_ERROR_NONE);
3352  }
3353 
3354  tcp_inc_counter (listen, TCP_ERROR_SYNS_RCVD, n_syns);
3355  vlib_buffer_free (vm, first_buffer, from_frame->n_vectors);
3356 
3357  return from_frame->n_vectors;
3358 }
3359 
3361  vlib_frame_t * from_frame)
3362 {
3363  return tcp46_listen_inline (vm, node, from_frame, 1 /* is_ip4 */ );
3364 }
3365 
3367  vlib_frame_t * from_frame)
3368 {
3369  return tcp46_listen_inline (vm, node, from_frame, 0 /* is_ip4 */ );
3370 }
3371 
3372 /* *INDENT-OFF* */
3374 {
3375  .name = "tcp4-listen",
3376  /* Takes a vector of packets. */
3377  .vector_size = sizeof (u32),
3378  .n_errors = TCP_N_ERROR,
3379  .error_strings = tcp_error_strings,
3380  .n_next_nodes = TCP_LISTEN_N_NEXT,
3381  .next_nodes =
3382  {
3383 #define _(s,n) [TCP_LISTEN_NEXT_##s] = n,
3385 #undef _
3386  },
3387  .format_trace = format_tcp_rx_trace_short,
3388 };
3389 /* *INDENT-ON* */
3390 
3391 /* *INDENT-OFF* */
3393 {
3394  .name = "tcp6-listen",
3395  /* Takes a vector of packets. */
3396  .vector_size = sizeof (u32),
3397  .n_errors = TCP_N_ERROR,
3398  .error_strings = tcp_error_strings,
3399  .n_next_nodes = TCP_LISTEN_N_NEXT,
3400  .next_nodes =
3401  {
3402 #define _(s,n) [TCP_LISTEN_NEXT_##s] = n,
3404 #undef _
3405  },
3406  .format_trace = format_tcp_rx_trace_short,
3407 };
3408 /* *INDENT-ON* */
3409 
/* Dispatch targets of the tcp4/6-input nodes; must line up with the
 * foreach_tcp4_input_next / foreach_tcp6_input_next tables below. */
typedef enum _tcp_input_next
{
  TCP_INPUT_NEXT_DROP,
  TCP_INPUT_NEXT_LISTEN,
  TCP_INPUT_NEXT_RCV_PROCESS,
  TCP_INPUT_NEXT_SYN_SENT,
  TCP_INPUT_NEXT_ESTABLISHED,
  TCP_INPUT_NEXT_RESET,
  TCP_INPUT_NEXT_PUNT,
  TCP_INPUT_N_NEXT
} tcp_input_next_t;
3422 #define foreach_tcp4_input_next \
3423  _ (DROP, "ip4-drop") \
3424  _ (LISTEN, "tcp4-listen") \
3425  _ (RCV_PROCESS, "tcp4-rcv-process") \
3426  _ (SYN_SENT, "tcp4-syn-sent") \
3427  _ (ESTABLISHED, "tcp4-established") \
3428  _ (RESET, "tcp4-reset") \
3429  _ (PUNT, "ip4-punt")
3430 
3431 #define foreach_tcp6_input_next \
3432  _ (DROP, "ip6-drop") \
3433  _ (LISTEN, "tcp6-listen") \
3434  _ (RCV_PROCESS, "tcp6-rcv-process") \
3435  _ (SYN_SENT, "tcp6-syn-sent") \
3436  _ (ESTABLISHED, "tcp6-established") \
3437  _ (RESET, "tcp6-reset") \
3438  _ (PUNT, "ip6-punt")
3439 
3440 #define filter_flags (TCP_FLAG_SYN|TCP_FLAG_ACK|TCP_FLAG_RST|TCP_FLAG_FIN)
3441 
3442 static void
3444  vlib_buffer_t ** bs, u32 n_bufs, u8 is_ip4)
3445 {
3446  tcp_connection_t *tc;
3447  tcp_header_t *tcp;
3448  tcp_rx_trace_t *t;
3449  int i;
3450 
3451  for (i = 0; i < n_bufs; i++)
3452  {
3453  if (bs[i]->flags & VLIB_BUFFER_IS_TRACED)
3454  {
3455  t = vlib_add_trace (vm, node, bs[i], sizeof (*t));
3456  tc = tcp_connection_get (vnet_buffer (bs[i])->tcp.connection_index,
3457  vm->thread_index);
3458  tcp = vlib_buffer_get_current (bs[i]);
3459  tcp_set_rx_trace_data (t, tc, tcp, bs[i], is_ip4);
3460  }
3461  }
3462 }
3463 
3464 static void
3466 {
3467  if (*error == TCP_ERROR_FILTERED || *error == TCP_ERROR_WRONG_THREAD)
3468  {
3469  *next = TCP_INPUT_NEXT_DROP;
3470  }
3471  else if ((is_ip4 && tm->punt_unknown4) || (!is_ip4 && tm->punt_unknown6))
3472  {
3473  *next = TCP_INPUT_NEXT_PUNT;
3474  *error = TCP_ERROR_PUNT;
3475  }
3476  else
3477  {
3478  *next = TCP_INPUT_NEXT_RESET;
3479  *error = TCP_ERROR_NO_LISTENER;
3480  }
3481 }
3482 
/* Parse the ip4/ip6 and tcp headers of buffer b, validate lengths, and
 * find the owning connection.  When is_nolookup is set the connection
 * index already stashed in the buffer metadata is used instead of a
 * session-table lookup.  On success the vnet_buffer tcp opaque is
 * populated (hdr_offset, seq/ack numbers, data_offset, data_len,
 * seq_end, flags) and the connection is returned; on a length error
 * *error is set to TCP_ERROR_LENGTH and 0 is returned.
 *
 * NOTE(review): extraction dropped several source lines -- 3483 (the
 * return-type line), 3489 (a local declaration, presumably the `tc`
 * connection pointer), 3495 and 3523 (the ip4/ip6 header locals taken
 * from the buffer's current data), 3545 (the PREDICT_FALSE condition,
 * presumably a link-local destination test), 3549 (the
 * fib_index_by_sw_if_index subscript), 3562 (the nolookup
 * connection-get call) and 3576 (the final return).  Confirm against
 * upstream tcp_input.c. */
3484 tcp_input_lookup_buffer (vlib_buffer_t * b, u8 thread_index, u32 * error,
3485  u8 is_ip4, u8 is_nolookup)
3486 {
3487  u32 fib_index = vnet_buffer (b)->ip.fib_index;
3488  int n_advance_bytes, n_data_bytes;
3490  tcp_header_t *tcp;
3491  u8 result = 0;
3492 
3493  if (is_ip4)
3494  {
3496  int ip_hdr_bytes = ip4_header_bytes (ip4);
3497  if (PREDICT_FALSE (b->current_length < ip_hdr_bytes + sizeof (*tcp)))
3498  {
3499  *error = TCP_ERROR_LENGTH;
3500  return 0;
3501  }
3502  tcp = ip4_next_header (ip4);
3503  vnet_buffer (b)->tcp.hdr_offset = (u8 *) tcp - (u8 *) ip4;
3504  n_advance_bytes = (ip_hdr_bytes + tcp_header_bytes (tcp));
3505  n_data_bytes = clib_net_to_host_u16 (ip4->length) - n_advance_bytes;
3506 
3507  /* Length check. Checksum computed by ipx_local no need to compute again */
3508  if (PREDICT_FALSE (n_data_bytes < 0))
3509  {
3510  *error = TCP_ERROR_LENGTH;
3511  return 0;
3512  }
3513 
3514  if (!is_nolookup)
3515  tc = session_lookup_connection_wt4 (fib_index, &ip4->dst_address,
3516  &ip4->src_address, tcp->dst_port,
3517  tcp->src_port,
3518  TRANSPORT_PROTO_TCP, thread_index,
3519  &result);
3520  }
3521  else
3522  {
3524  if (PREDICT_FALSE (b->current_length < sizeof (*ip6) + sizeof (*tcp)))
3525  {
3526  *error = TCP_ERROR_LENGTH;
3527  return 0;
3528  }
3529  tcp = ip6_next_header (ip6);
3530  vnet_buffer (b)->tcp.hdr_offset = (u8 *) tcp - (u8 *) ip6;
3531  n_advance_bytes = tcp_header_bytes (tcp);
3532  n_data_bytes = clib_net_to_host_u16 (ip6->payload_length)
3533  - n_advance_bytes;
3534  n_advance_bytes += sizeof (ip6[0]);
3535 
3536  if (PREDICT_FALSE (n_data_bytes < 0))
3537  {
3538  *error = TCP_ERROR_LENGTH;
3539  return 0;
3540  }
3541 
3542  if (!is_nolookup)
3543  {
  /* Special-cased fib index resolution via the rx sw interface
   * (condition line 3545 lost in extraction; likely a link-local
   * destination check -- confirm upstream). */
3544  if (PREDICT_FALSE
3546  {
3547  ip4_main_t *im = &ip4_main;
3548  fib_index = vec_elt (im->fib_index_by_sw_if_index,
3550  }
3551 
3552  tc = session_lookup_connection_wt6 (fib_index, &ip6->dst_address,
3553  &ip6->src_address,
3554  tcp->dst_port, tcp->src_port,
3555  TRANSPORT_PROTO_TCP,
3556  thread_index, &result);
3557  }
3558  }
3559 
  /* Fast path: upstream node already resolved the connection index. */
3560  if (is_nolookup)
3561  tc =
3563  tcp.connection_index,
3564  thread_index);
3565 
3566  vnet_buffer (b)->tcp.seq_number = clib_net_to_host_u32 (tcp->seq_number);
3567  vnet_buffer (b)->tcp.ack_number = clib_net_to_host_u32 (tcp->ack_number);
3568  vnet_buffer (b)->tcp.data_offset = n_advance_bytes;
3569  vnet_buffer (b)->tcp.data_len = n_data_bytes;
3570  vnet_buffer (b)->tcp.seq_end = vnet_buffer (b)->tcp.seq_number
3571  + n_data_bytes;
3572  vnet_buffer (b)->tcp.flags = 0;
3573 
  /* Propagate a nonzero lookup result code (e.g. filtered or wrong
   * thread) to the caller's error. */
3574  *error = result ? TCP_ERROR_NONE + result : *error;
3575 
3577 }
3578 
/* Dispatch one buffer for connection tc: index the per-state dispatch
 * table by (connection state, SYN|ACK|RST|FIN flags of the segment) to
 * select the next node and error code, bump the per-connection segment
 * counter, and log unexpected state/flag combinations.
 *
 * NOTE(review): extraction dropped source line 3580 (the name line).
 * The call sites in tcp46_input_inline below pass (tm, tc, b, &next,
 * &error), so the signature is presumably
 *   tcp_input_dispatch_buffer (tcp_main_t *tm, tcp_connection_t *tc,
 *       vlib_buffer_t *b, u16 *next, u32 *error) -- confirm upstream. */
3579 static inline void
3581  vlib_buffer_t * b, u16 * next, u32 * error)
3582 {
3583  tcp_header_t *tcp;
3584  u8 flags;
3585 
3586  tcp = tcp_buffer_hdr (b);
3587  flags = tcp->flags & filter_flags;
3588  *next = tm->dispatch_table[tc->state][flags].next;
3589  *error = tm->dispatch_table[tc->state][flags].error;
3590  tc->segs_in += 1;
3591 
3592  if (PREDICT_FALSE (*error == TCP_ERROR_DISPATCH
3593  || *next == TCP_INPUT_NEXT_RESET))
3594  {
3595  /* Overload tcp flags to store state */
3596  tcp_state_t state = tc->state;
3597  vnet_buffer (b)->tcp.flags = tc->state;
3598 
3599  if (*error == TCP_ERROR_DISPATCH)
3600  clib_warning ("tcp conn %u disp error state %U flags %U",
3601  tc->c_c_index, format_tcp_state, state,
3602  format_tcp_flags, (int) flags);
3603  }
3604 }
3605 
/* Shared ip4/ip6 tcp input node body.  For each buffer in the frame:
 * parse headers and resolve the connection (tcp_input_lookup_buffer),
 * then pick the next node via the state/flags dispatch table
 * (tcp_input_dispatch_buffer); packets with no connection go through
 * tcp_input_set_error_next (drop/punt/reset).  Processes buffers two at
 * a time with prefetch of the following pair, then a scalar tail loop.
 *
 * NOTE(review): extraction dropped source lines 3606-3607 (the
 * `static inline uword tcp46_input_inline (vlib_main_t *vm,
 * vlib_node_runtime_t *node,` signature lines -- inferred from the
 * callers below) and line 3708 (presumably the VLIB_NODE_FLAG_TRACE
 * guard in front of tcp_input_trace_frame -- confirm upstream). */
3608  vlib_frame_t * frame, int is_ip4, u8 is_nolookup)
3609 {
3610  u32 n_left_from, *from, thread_index = vm->thread_index;
3611  tcp_main_t *tm = vnet_get_tcp_main ();
3612  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
3613  u16 nexts[VLIB_FRAME_SIZE], *next;
3614 
3615  tcp_set_time_now (tcp_get_worker (thread_index));
3616 
3617  from = vlib_frame_vector_args (frame);
3618  n_left_from = frame->n_vectors;
3619  vlib_get_buffers (vm, from, bufs, n_left_from);
3620 
3621  b = bufs;
3622  next = nexts;
3623 
  /* Dual-buffer loop: handle b[0]/b[1] while prefetching b[2]/b[3]. */
3624  while (n_left_from >= 4)
3625  {
3626  u32 error0 = TCP_ERROR_NO_LISTENER, error1 = TCP_ERROR_NO_LISTENER;
3627  tcp_connection_t *tc0, *tc1;
3628 
3629  {
3630  vlib_prefetch_buffer_header (b[2], STORE);
3631  CLIB_PREFETCH (b[2]->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
3632 
3633  vlib_prefetch_buffer_header (b[3], STORE);
3634  CLIB_PREFETCH (b[3]->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
3635  }
3636 
3637  next[0] = next[1] = TCP_INPUT_NEXT_DROP;
3638 
3639  tc0 = tcp_input_lookup_buffer (b[0], thread_index, &error0, is_ip4,
3640  is_nolookup);
3641  tc1 = tcp_input_lookup_buffer (b[1], thread_index, &error1, is_ip4,
3642  is_nolookup);
3643 
  /* Common case: both lookups succeeded -> dispatch both. */
3644  if (PREDICT_TRUE (!tc0 + !tc1 == 0))
3645  {
3646  ASSERT (tcp_lookup_is_valid (tc0, b[0], tcp_buffer_hdr (b[0])));
3647  ASSERT (tcp_lookup_is_valid (tc1, b[1], tcp_buffer_hdr (b[1])));
3648 
3649  vnet_buffer (b[0])->tcp.connection_index = tc0->c_c_index;
3650  vnet_buffer (b[1])->tcp.connection_index = tc1->c_c_index;
3651 
3652  tcp_input_dispatch_buffer (tm, tc0, b[0], &next[0], &error0);
3653  tcp_input_dispatch_buffer (tm, tc1, b[1], &next[1], &error1);
3654  }
3655  else
3656  {
  /* Handle each buffer independently; missing connections go to
   * drop/punt/reset via tcp_input_set_error_next. */
3657  if (PREDICT_TRUE (tc0 != 0))
3658  {
3659  ASSERT (tcp_lookup_is_valid (tc0, b[0], tcp_buffer_hdr (b[0])));
3660  vnet_buffer (b[0])->tcp.connection_index = tc0->c_c_index;
3661  tcp_input_dispatch_buffer (tm, tc0, b[0], &next[0], &error0);
3662  }
3663  else
3664  tcp_input_set_error_next (tm, &next[0], &error0, is_ip4);
3665 
3666  if (PREDICT_TRUE (tc1 != 0))
3667  {
3668  ASSERT (tcp_lookup_is_valid (tc1, b[1], tcp_buffer_hdr (b[1])));
3669  vnet_buffer (b[1])->tcp.connection_index = tc1->c_c_index;
3670  tcp_input_dispatch_buffer (tm, tc1, b[1], &next[1], &error1);
3671  }
3672  else
3673  tcp_input_set_error_next (tm, &next[1], &error1, is_ip4);
3674  }
3675 
3676  b += 2;
3677  next += 2;
3678  n_left_from -= 2;
3679  }
  /* Scalar tail loop for the remaining 0-3 buffers. */
3680  while (n_left_from > 0)
3681  {
3682  tcp_connection_t *tc0;
3683  u32 error0 = TCP_ERROR_NO_LISTENER;
3684 
3685  if (n_left_from > 1)
3686  {
3687  vlib_prefetch_buffer_header (b[1], STORE);
3688  CLIB_PREFETCH (b[1]->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
3689  }
3690 
3691  next[0] = TCP_INPUT_NEXT_DROP;
3692  tc0 = tcp_input_lookup_buffer (b[0], thread_index, &error0, is_ip4,
3693  is_nolookup);
3694  if (PREDICT_TRUE (tc0 != 0))
3695  {
3696  ASSERT (tcp_lookup_is_valid (tc0, b[0], tcp_buffer_hdr (b[0])));
3697  vnet_buffer (b[0])->tcp.connection_index = tc0->c_c_index;
3698  tcp_input_dispatch_buffer (tm, tc0, b[0], &next[0], &error0);
3699  }
3700  else
3701  tcp_input_set_error_next (tm, &next[0], &error0, is_ip4);
3702 
3703  b += 1;
3704  next += 1;
3705  n_left_from -= 1;
3706  }
3707 
3709  tcp_input_trace_frame (vm, node, bufs, frame->n_vectors, is_ip4);
3710 
3711  vlib_buffer_enqueue_to_next (vm, node, from, nexts, frame->n_vectors);
3712  return frame->n_vectors;
3713 }
3714 
/* tcp4-input-nolookup node function: ip4 path that trusts the
 * connection index already stashed in the buffer metadata (no session
 * table lookup).  NOTE(review): extraction dropped lines 3715-3716,
 * presumably `VLIB_NODE_FN (tcp4_input_nolookup_node) (vlib_main_t *
 * vm, vlib_node_runtime_t * node,` -- confirm upstream. */
3717  vlib_frame_t * from_frame)
3718 {
3719  return tcp46_input_inline (vm, node, from_frame, 1 /* is_ip4 */ ,
3720  1 /* is_nolookup */ );
3721 }
3722 
/* tcp6-input-nolookup node function: ip6 counterpart of the above
 * (is_ip4 = 0, is_nolookup = 1).  NOTE(review): extraction dropped
 * lines 3723-3724, presumably the VLIB_NODE_FN
 * (tcp6_input_nolookup_node) wrapper -- confirm upstream. */
3725  vlib_frame_t * from_frame)
3726 {
3727  return tcp46_input_inline (vm, node, from_frame, 0 /* is_ip4 */ ,
3728  1 /* is_nolookup */ );
3729 }
3730 
3731 /* *INDENT-OFF* */
/* Graph node registration for tcp4-input-nolookup.  NOTE(review):
 * extraction dropped line 3732; per the doxygen index this is
 * `VLIB_REGISTER_NODE (tcp4_input_nolookup_node) =`. */
3733 {
3734  .name = "tcp4-input-nolookup",
3735  /* Takes a vector of packets. */
3736  .vector_size = sizeof (u32),
3737  .n_errors = TCP_N_ERROR,
3738  .error_strings = tcp_error_strings,
3739  .n_next_nodes = TCP_INPUT_N_NEXT,
3740  .next_nodes =
3741  {
3742 #define _(s,n) [TCP_INPUT_NEXT_##s] = n,
3744 #undef _
3745  },
3746  .format_buffer = format_tcp_header,
3747  .format_trace = format_tcp_rx_trace,
3748 };
3749 /* *INDENT-ON* */
3750 
3751 /* *INDENT-OFF* */
/* Graph node registration for tcp6-input-nolookup.  NOTE(review):
 * extraction dropped line 3752, presumably
 * `VLIB_REGISTER_NODE (tcp6_input_nolookup_node) =`.  The next-node
 * expansion line 3743/3763 (the foreach_tcpN_input_next invocation)
 * is also missing from this dump. */
3753 {
3754  .name = "tcp6-input-nolookup",
3755  /* Takes a vector of packets. */
3756  .vector_size = sizeof (u32),
3757  .n_errors = TCP_N_ERROR,
3758  .error_strings = tcp_error_strings,
3759  .n_next_nodes = TCP_INPUT_N_NEXT,
3760  .next_nodes =
3761  {
3762 #define _(s,n) [TCP_INPUT_NEXT_##s] = n,
3764 #undef _
3765  },
3766  .format_buffer = format_tcp_header,
3767  .format_trace = format_tcp_rx_trace,
3768 };
3769 /* *INDENT-ON* */
3770 
/* tcp4-input node function: ip4 path with full session-table lookup
 * (is_nolookup = 0).  NOTE(review): extraction dropped line 3771,
 * presumably the VLIB_NODE_FN (tcp4_input_node) wrapper line. */
3772  vlib_frame_t * from_frame)
3773 {
3774  return tcp46_input_inline (vm, node, from_frame, 1 /* is_ip4 */ ,
3775  0 /* is_nolookup */ );
3776 }
3777 
/* tcp6-input node function: ip6 path with full session-table lookup
 * (is_ip4 = 0, is_nolookup = 0).  NOTE(review): extraction dropped
 * line 3778, presumably the VLIB_NODE_FN (tcp6_input_node) wrapper. */
3779  vlib_frame_t * from_frame)
3780 {
3781  return tcp46_input_inline (vm, node, from_frame, 0 /* is_ip4 */ ,
3782  0 /* is_nolookup */ );
3783 }
3784 
3785 /* *INDENT-OFF* */
/* Graph node registration for tcp4-input.  NOTE(review): extraction
 * dropped line 3786, presumably `VLIB_REGISTER_NODE (tcp4_input_node)
 * =`, and the foreach_tcp4_input_next expansion line 3797. */
3787 {
3788  .name = "tcp4-input",
3789  /* Takes a vector of packets. */
3790  .vector_size = sizeof (u32),
3791  .n_errors = TCP_N_ERROR,
3792  .error_strings = tcp_error_strings,
3793  .n_next_nodes = TCP_INPUT_N_NEXT,
3794  .next_nodes =
3795  {
3796 #define _(s,n) [TCP_INPUT_NEXT_##s] = n,
3798 #undef _
3799  },
3800  .format_buffer = format_tcp_header,
3801  .format_trace = format_tcp_rx_trace,
3802 };
3803 /* *INDENT-ON* */
3804 
3805 /* *INDENT-OFF* */
/* Graph node registration for tcp6-input.  NOTE(review): extraction
 * dropped line 3806; per the doxygen index this is
 * `VLIB_REGISTER_NODE (tcp6_input_node) =`.  The
 * foreach_tcp6_input_next expansion line 3817 is also missing. */
3807 {
3808  .name = "tcp6-input",
3809  /* Takes a vector of packets. */
3810  .vector_size = sizeof (u32),
3811  .n_errors = TCP_N_ERROR,
3812  .error_strings = tcp_error_strings,
3813  .n_next_nodes = TCP_INPUT_N_NEXT,
3814  .next_nodes =
3815  {
3816 #define _(s,n) [TCP_INPUT_NEXT_##s] = n,
3818 #undef _
3819  },
3820  .format_buffer = format_tcp_header,
3821  .format_trace = format_tcp_rx_trace,
3822 };
3823 /* *INDENT-ON* */
3824 
3825 #ifndef CLIB_MARCH_VARIANT
/* Populate tm->dispatch_table: for every (tcp state, SYN|ACK|RST|FIN
 * flag combination) pair, the next node and error code used by
 * tcp_input_dispatch_buffer.  Everything defaults to drop +
 * TCP_ERROR_DISPATCH, then the `_` macro fills in the legal
 * transitions per RFC 793.
 *
 * NOTE(review): extraction dropped the name line (per the doxygen
 * index: `static void tcp_dispatch_table_init (tcp_main_t *tm)`) and
 * roughly every other continuation line of the two-line table entries
 * (the lines naming state + flags before the TCP_ERROR_* lines below)
 * -- confirm the full table against upstream tcp_input.c. */
3826 static void
3828 {
3829  int i, j;
3830  for (i = 0; i < ARRAY_LEN (tm->dispatch_table); i++)
3831  for (j = 0; j < ARRAY_LEN (tm->dispatch_table[i]); j++)
3832  {
3833  tm->dispatch_table[i][j].next = TCP_INPUT_NEXT_DROP;
3834  tm->dispatch_table[i][j].error = TCP_ERROR_DISPATCH;
3835  }
3836 
3837 #define _(t,f,n,e) \
3838 do { \
3839  tm->dispatch_table[TCP_STATE_##t][f].next = (n); \
3840  tm->dispatch_table[TCP_STATE_##t][f].error = (e); \
3841 } while (0)
3842 
3843  /* RFC 793: In LISTEN if RST drop and if ACK return RST */
3844  _(LISTEN, 0, TCP_INPUT_NEXT_DROP, TCP_ERROR_SEGMENT_INVALID);
3845  _(LISTEN, TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, TCP_ERROR_ACK_INVALID);
3846  _(LISTEN, TCP_FLAG_RST, TCP_INPUT_NEXT_DROP, TCP_ERROR_INVALID_CONNECTION);
3847  _(LISTEN, TCP_FLAG_SYN, TCP_INPUT_NEXT_LISTEN, TCP_ERROR_NONE);
3849  TCP_ERROR_ACK_INVALID);
3851  TCP_ERROR_SEGMENT_INVALID);
3853  TCP_ERROR_SEGMENT_INVALID);
3855  TCP_ERROR_INVALID_CONNECTION);
3856  _(LISTEN, TCP_FLAG_FIN, TCP_INPUT_NEXT_RESET, TCP_ERROR_SEGMENT_INVALID);
3858  TCP_ERROR_SEGMENT_INVALID);
3860  TCP_ERROR_SEGMENT_INVALID);
3862  TCP_ERROR_NONE);
3864  TCP_ERROR_SEGMENT_INVALID);
3866  TCP_ERROR_SEGMENT_INVALID);
3868  TCP_ERROR_SEGMENT_INVALID);
3870  TCP_INPUT_NEXT_DROP, TCP_ERROR_SEGMENT_INVALID);
3871  /* ACK for a SYN-ACK -> tcp-rcv-process. */
3872  _(SYN_RCVD, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3873  _(SYN_RCVD, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3875  TCP_ERROR_NONE);
3876  _(SYN_RCVD, TCP_FLAG_SYN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3878  TCP_ERROR_NONE);
3880  TCP_ERROR_NONE);
3881  _(SYN_RCVD, TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_ACK,
3882  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3883  _(SYN_RCVD, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3885  TCP_ERROR_NONE);
3887  TCP_ERROR_NONE);
3888  _(SYN_RCVD, TCP_FLAG_FIN | TCP_FLAG_RST | TCP_FLAG_ACK,
3889  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3891  TCP_ERROR_NONE);
3892  _(SYN_RCVD, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_RST,
3893  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3894  _(SYN_RCVD, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_ACK,
3895  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3897  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3898  _(SYN_RCVD, 0, TCP_INPUT_NEXT_DROP, TCP_ERROR_SEGMENT_INVALID);
3899  /* SYN-ACK for a SYN */
3901  TCP_ERROR_NONE);
3902  _(SYN_SENT, TCP_FLAG_ACK, TCP_INPUT_NEXT_SYN_SENT, TCP_ERROR_NONE);
3903  _(SYN_SENT, TCP_FLAG_RST, TCP_INPUT_NEXT_SYN_SENT, TCP_ERROR_NONE);
3905  TCP_ERROR_NONE);
3906  _(SYN_SENT, TCP_FLAG_FIN, TCP_INPUT_NEXT_SYN_SENT, TCP_ERROR_NONE);
3908  TCP_ERROR_NONE);
3909  /* ACK for established connection -> tcp-established. */
3910  _(ESTABLISHED, TCP_FLAG_ACK, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
3911  /* FIN for established connection -> tcp-established. */
3912  _(ESTABLISHED, TCP_FLAG_FIN, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
3914  TCP_ERROR_NONE);
3916  TCP_ERROR_NONE);
3917  _(ESTABLISHED, TCP_FLAG_FIN | TCP_FLAG_RST | TCP_FLAG_ACK,
3918  TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
3920  TCP_ERROR_NONE);
3921  _(ESTABLISHED, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_ACK,
3922  TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
3923  _(ESTABLISHED, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_RST,
3924  TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
3925  _(ESTABLISHED, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_ACK,
3926  TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
3927  _(ESTABLISHED, TCP_FLAG_RST, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
3929  TCP_ERROR_NONE);
3930  _(ESTABLISHED, TCP_FLAG_SYN, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
3932  TCP_ERROR_NONE);
3934  TCP_ERROR_NONE);
3935  _(ESTABLISHED, TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_ACK,
3936  TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
3937  _(ESTABLISHED, 0, TCP_INPUT_NEXT_DROP, TCP_ERROR_SEGMENT_INVALID);
3938  /* ACK or FIN-ACK to our FIN */
3939  _(FIN_WAIT_1, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3941  TCP_ERROR_NONE);
3942  /* FIN in reply to our FIN from the other side */
3943  _(FIN_WAIT_1, 0, TCP_INPUT_NEXT_DROP, TCP_ERROR_SEGMENT_INVALID);
3944  _(FIN_WAIT_1, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3946  TCP_ERROR_NONE);
3947  _(FIN_WAIT_1, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_ACK,
3948  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3949  _(FIN_WAIT_1, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_RST,
3950  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3951  _(FIN_WAIT_1, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_ACK,
3952  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3954  TCP_ERROR_NONE);
3955  _(FIN_WAIT_1, TCP_FLAG_FIN | TCP_FLAG_RST | TCP_FLAG_ACK,
3956  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3957  _(FIN_WAIT_1, TCP_FLAG_SYN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3959  TCP_ERROR_NONE);
3961  TCP_ERROR_NONE);
3962  _(FIN_WAIT_1, TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_ACK,
3963  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3964  _(FIN_WAIT_1, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3966  TCP_ERROR_NONE);
3967  _(CLOSING, 0, TCP_INPUT_NEXT_DROP, TCP_ERROR_SEGMENT_INVALID);
3968  _(CLOSING, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3969  _(CLOSING, TCP_FLAG_SYN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3971  TCP_ERROR_NONE);
3973  TCP_ERROR_NONE);
3974  _(CLOSING, TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_ACK,
3975  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3976  _(CLOSING, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3978  TCP_ERROR_NONE);
3979  _(CLOSING, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3981  TCP_ERROR_NONE);
3983  TCP_ERROR_NONE);
3984  _(CLOSING, TCP_FLAG_FIN | TCP_FLAG_RST | TCP_FLAG_ACK,
3985  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3987  TCP_ERROR_NONE);
3988  _(CLOSING, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_ACK,
3989  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3990  _(CLOSING, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_RST,
3991  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3993  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3994  /* FIN confirming that the peer (app) has closed */
3995  _(FIN_WAIT_2, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3996  _(FIN_WAIT_2, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3998  TCP_ERROR_NONE);
3999  _(FIN_WAIT_2, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
4001  TCP_ERROR_NONE);
4002  _(CLOSE_WAIT, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
4004  TCP_ERROR_NONE);
4005  _(CLOSE_WAIT, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
4007  TCP_ERROR_NONE);
4008  _(LAST_ACK, 0, TCP_INPUT_NEXT_DROP, TCP_ERROR_SEGMENT_INVALID);
4009  _(LAST_ACK, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
4010  _(LAST_ACK, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
4012  TCP_ERROR_NONE);
4014  TCP_ERROR_NONE);
4015  _(LAST_ACK, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_ACK,
4016  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
4018  TCP_ERROR_NONE);
4019  _(LAST_ACK, TCP_FLAG_FIN | TCP_FLAG_RST | TCP_FLAG_ACK,
4020  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
4021  _(LAST_ACK, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_RST,
4022  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
4024  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
4025  _(LAST_ACK, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
4027  TCP_ERROR_NONE);
4028  _(LAST_ACK, TCP_FLAG_SYN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
4030  TCP_ERROR_NONE);
4032  TCP_ERROR_NONE);
4033  _(LAST_ACK, TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_ACK,
4034  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
4035  _(TIME_WAIT, TCP_FLAG_SYN, TCP_INPUT_NEXT_LISTEN, TCP_ERROR_NONE);
4036  _(TIME_WAIT, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
4038  TCP_ERROR_NONE);
4039  _(TIME_WAIT, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
4041  TCP_ERROR_NONE);
4042  _(TIME_WAIT, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
4043  /* RFC793 CLOSED: An incoming segment containing a RST is discarded. An
4044  * incoming segment not containing a RST causes a RST to be sent in
4045  * response.*/
4046  _(CLOSED, TCP_FLAG_RST, TCP_INPUT_NEXT_DROP, TCP_ERROR_CONNECTION_CLOSED);
4048  TCP_ERROR_CONNECTION_CLOSED);
4049  _(CLOSED, TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, TCP_ERROR_NONE);
4050  _(CLOSED, TCP_FLAG_SYN, TCP_INPUT_NEXT_RESET, TCP_ERROR_NONE);
4052  TCP_ERROR_NONE);
4053 #undef _
4054 }
4055 
/* Init-time hook: run after tcp_init, then populate the per-state
 * dispatch table.  NOTE(review): extraction dropped line 4057 (the
 * name line -- `tcp_input_init (vlib_main_t * vm)` per the
 * VLIB_INIT_FUNCTION convention) and line 4066, presumably the
 * `tcp_dispatch_table_init (tm);` call that the comment below refers
 * to -- confirm upstream. */
4056 static clib_error_t *
4058 {
4059  clib_error_t *error = 0;
4060  tcp_main_t *tm = vnet_get_tcp_main ();
4061 
4062  if ((error = vlib_call_init_function (vm, tcp_init)))
4063  return error;
4064 
4065  /* Initialize dispatch table. */
4067 
4068  return error;
4069 }
4070 
4072 
4073 #endif /* CLIB_MARCH_VARIANT */
4074 
4075 /*
4076  * fd.io coding-style-patch-verification: ON
4077  *
4078  * Local Variables:
4079  * eval: (c-set-style "gnu")
4080  * End:
4081  */
static void tcp_program_disconnect(tcp_worker_ctx_t *wrk, tcp_connection_t *tc)
Definition: tcp_input.c:1667
#define tcp_in_cong_recovery(tc)
Definition: tcp.h:474
static int tcp_session_enqueue_ooo(tcp_connection_t *tc, vlib_buffer_t *b, u16 data_len)
Enqueue out-of-order data.
Definition: tcp_input.c:1850
static void tcp_update_timestamp(tcp_connection_t *tc, u32 seq, u32 seq_end)
Update tsval recent.
Definition: tcp_input.c:251
u16 lb_n_buckets
number of buckets in the load-balance.
Definition: load_balance.h:116
static sack_scoreboard_hole_t * scoreboard_insert_hole(sack_scoreboard_t *sb, u32 prev_index, u32 start, u32 end)
Definition: tcp_input.c:724
static u8 tcp_scoreboard_is_sane_post_recovery(tcp_connection_t *tc)
Test that scoreboard is sane after recovery.
Definition: tcp_input.c:970
u32 flags
buffer flags: VLIB_BUFFER_FREE_LIST_INDEX_MASK: bits used to store free list index, VLIB_BUFFER_IS_TRACED: trace this buffer.
Definition: buffer.h:124
void scoreboard_clear(sack_scoreboard_t *sb)
Definition: tcp_input.c:927
u32 connection_index
Index of the transport connection associated to the session.
void tcp_program_retransmit(tcp_connection_t *tc)
Definition: tcp_output.c:1198
End of options.
Definition: tcp_packet.h:104
#define clib_min(x, y)
Definition: clib.h:295
#define CLIB_UNUSED(x)
Definition: clib.h:82
u32 * pending_disconnects
vector of pending disconnect notifications
Definition: tcp.h:522
vlib_node_registration_t tcp6_rcv_process_node
(constructor) VLIB_REGISTER_NODE (tcp6_rcv_process_node)
Definition: tcp_input.c:3175
static u32 ip6_fib_table_fwding_lookup(u32 fib_index, const ip6_address_t *dst)
Definition: ip6_fib.h:67
#define tcp_in_recovery(tc)
Definition: tcp.h:465
static f64 tcp_time_now_us(u32 thread_index)
Definition: tcp.h:1028
static void tcp_rcv_fin(tcp_worker_ctx_t *wrk, tcp_connection_t *tc, vlib_buffer_t *b, u32 *error)
Definition: tcp_input.c:1698
#define TCP_OPTION_LEN_SACK_PERMITTED
Definition: tcp_packet.h:166
#define seq_leq(_s1, _s2)
Definition: tcp.h:874
struct _sack_block sack_block_t
void tcp_rcv_sacks(tcp_connection_t *tc, u32 ack)
Definition: tcp_input.c:981
static void vlib_buffer_free(vlib_main_t *vm, u32 *buffers, u32 n_buffers)
Free buffers Frees the entire buffer chain for each buffer.
Definition: buffer_funcs.h:890
#define timestamp_leq(_t1, _t2)
Definition: tcp.h:881
ip4_address_t src_address
Definition: ip4_packet.h:170
static u8 tcp_cc_is_spurious_retransmit(tcp_connection_t *tc)
Definition: tcp_input.c:1287
transport_connection_t * session_lookup_connection_wt6(u32 fib_index, ip6_address_t *lcl, ip6_address_t *rmt, u16 lcl_port, u16 rmt_port, u8 proto, u32 thread_index, u8 *result)
Lookup connection with ip6 and transport layer information.
vnet_main_t * vnet_get_main(void)
Definition: misc.c:46
enum _tcp_state_next tcp_state_next_t
static vnet_hw_interface_t * vnet_get_sup_hw_interface(vnet_main_t *vnm, u32 sw_if_index)
#define tcp_rst(_th)
Definition: tcp_packet.h:81
Selective Ack permitted.
Definition: tcp_packet.h:108
#define TCP_FLAG_SYN
Definition: fa_node.h:13
#define tcp_opts_tstamp(_to)
Definition: tcp_packet.h:156
#define PREDICT_TRUE(x)
Definition: clib.h:112
#define tcp_inc_err_counter(cnts, err, val)
Definition: tcp_input.c:2132
unsigned long u64
Definition: types.h:89
#define tcp_store_err_counters(node_id, cnts)
Definition: tcp_input.c:2136
static void tcp_dispatch_table_init(tcp_main_t *tm)
Definition: tcp_input.c:3827
#define clib_memcpy_fast(a, b, c)
Definition: string.h:81
static u8 * format_tcp_rx_trace_short(u8 *s, va_list *args)
Definition: tcp_input.c:2043
static int tcp_segment_rcv(tcp_worker_ctx_t *wrk, tcp_connection_t *tc, vlib_buffer_t *b)
Receive buffer for connection and handle acks.
Definition: tcp_input.c:1954
clib_memset(h->entries, 0, sizeof(h->entries[0]) *entries)
struct _sack_scoreboard sack_scoreboard_t
static uword tcp46_established_inline(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame, int is_ip4)
Definition: tcp_input.c:2146
static tcp_connection_t * tcp_half_open_connection_get(u32 conn_index)
Definition: tcp.h:777
void tcp_update_rto(tcp_connection_t *tc)
Definition: tcp_input.c:478
svm_fifo_t * rx_fifo
Pointers to rx/tx buffers.
#define tcp_doff(_th)
Definition: tcp_packet.h:78
struct _tcp_main tcp_main_t
u32 thread_index
Definition: main.h:218
void tcp_connection_timers_reset(tcp_connection_t *tc)
Stop all connection timers.
Definition: tcp.c:519
u16 current_length
Nbytes between current data and the end of this buffer.
Definition: buffer.h:113
int session_main_flush_enqueue_events(u8 transport_proto, u32 thread_index)
Flushes queue of sessions that are to be notified of new data enqueued events.
Definition: session.c:659
u8 data[0]
Packet data.
Definition: buffer.h:181
#define vec_add1(V, E)
Add 1 element to end of vector (unspecified alignment).
Definition: vec.h:523
#define tcp_recovery_off(tc)
Definition: tcp.h:463
#define clib_abs(x)
Definition: clib.h:302
u32 dpo_get_urpf(const dpo_id_t *dpo)
Get a uRPF interface for the DPO.
Definition: dpo.c:382
#define vec_add2(V, P, N)
Add N elements to end of vector V, return pointer to new elements in P.
Definition: vec.h:561
int i
#define THZ
TCP tick frequency.
Definition: tcp.h:28
static u32 format_get_indent(u8 *s)
Definition: format.h:72
vlib_node_registration_t tcp4_rcv_process_node
(constructor) VLIB_REGISTER_NODE (tcp4_rcv_process_node)
Definition: tcp_input.c:3156
u32 * fib_index_by_sw_if_index
Table index indexed by software interface.
Definition: ip4.h:121
struct _tcp_connection tcp_connection_t
static session_t * session_get(u32 si, u32 thread_index)
Definition: session.h:295
u8 * format(u8 *s, const char *fmt,...)
Definition: format.c:424
static u32 tcp_available_cc_snd_space(const tcp_connection_t *tc)
Estimate of how many bytes we can still push into the network.
Definition: tcp.h:977
#define tcp_opts_sack(_to)
Definition: tcp_packet.h:158
tcp_connection_t tcp_connection
Definition: tcp_input.c:2023
static u8 tcp_sack_vector_is_sane(sack_block_t *sacks)
Definition: tcp_input.c:1720
static tcp_connection_t * tcp_get_connection_from_transport(transport_connection_t *tconn)
Definition: tcp.h:736
#define VLIB_NODE_FN(node)
Definition: node.h:202
static void tcp_cc_congestion_undo(tcp_connection_t *tc)
Definition: tcp_input.c:1268
#define tcp_disconnect_pending_on(tc)
Definition: tcp.h:468
int session_enqueue_stream_connection(transport_connection_t *tc, vlib_buffer_t *b, u32 offset, u8 queue_event, u8 is_in_order)
Definition: session.c:414
u64 session_lookup_half_open_handle(transport_connection_t *tc)
No operation.
Definition: tcp_packet.h:105
format_function_t format_tcp_flags
Definition: tcp.h:65
#define pool_get(P, E)
Allocate an object E from a pool P (unspecified alignment).
Definition: pool.h:237
u8 n_sack_blocks
Number of SACKs blocks.
Definition: tcp_packet.h:151
struct _tcp_header tcp_header_t
int tcp_half_open_connection_cleanup(tcp_connection_t *tc)
Try to cleanup half-open connection.
Definition: tcp.c:210
ip6_address_t src_address
Definition: ip6_packet.h:307
void scoreboard_clear_reneging(sack_scoreboard_t *sb, u32 start, u32 end)
Definition: tcp_input.c:946
u32 * pending_deq_acked
vector of pending ack dequeues
Definition: tcp.h:519
unsigned char u8
Definition: types.h:56
#define tcp_inc_counter(node_id, err, count)
Definition: tcp_input.c:2124
vlib_node_registration_t tcp6_syn_sent_node
(constructor) VLIB_REGISTER_NODE (tcp6_syn_sent_node)
Definition: tcp_input.c:2742
struct _sack_scoreboard_hole sack_scoreboard_hole_t
u8 wscale
Option flags, see above.
Definition: tcp_packet.h:146
#define vec_reset_length(v)
Reset vector length to zero NULL-pointer tolerant.
static tcp_connection_t * tcp_lookup_connection(u32 fib_index, vlib_buffer_t *b, u8 thread_index, u8 is_ip4)
Lookup transport connection.
Definition: tcp_input.c:2350
double f64
Definition: types.h:142
#define tcp_fastrecovery_on(tc)
Definition: tcp.h:460
Limit MSS.
Definition: tcp_packet.h:106
void session_transport_closing_notify(transport_connection_t *tc)
Notification from transport that connection is being closed.
Definition: session.c:858
sack_scoreboard_hole_t * scoreboard_get_hole(sack_scoreboard_t *sb, u32 index)
Definition: tcp_input.c:649
#define TCP_TICK
TCP tick period (s)
Definition: tcp.h:27
void scoreboard_init_rxt(sack_scoreboard_t *sb, u32 snd_una)
Definition: tcp_input.c:905
#define tcp_is_fin(_th)
Definition: tcp_packet.h:90
#define seq_gt(_s1, _s2)
Definition: tcp.h:875
static u8 * format_tcp_rx_trace(u8 *s, va_list *args)
Definition: tcp_input.c:2027
static void tcp_connection_set_state(tcp_connection_t *tc, tcp_state_t state)
Definition: tcp.h:742
void tcp_init_snd_vars(tcp_connection_t *tc)
Initialize connection send variables.
Definition: tcp.c:691
#define tcp_cfg
Definition: tcp.h:679
vl_api_interface_index_t sw_if_index
Definition: gre.api:59
u8 * format_tcp_connection_id(u8 *s, va_list *args)
Definition: tcp.c:1040
#define VLIB_INIT_FUNCTION(x)
Definition: init.h:173
vlib_node_registration_t tcp4_established_node
(constructor) VLIB_REGISTER_NODE (tcp4_established_node)
Definition: tcp_input.c:2244
#define TCP_OPTION_LEN_SACK_BLOCK
Definition: tcp_packet.h:168
ip4_address_t dst_address
Definition: ip4_packet.h:170
#define TCP_FLAG_ACK
Definition: fa_node.h:16
u8 * format_white_space(u8 *s, va_list *va)
Definition: std-formats.c:129
transport_connection_t * session_lookup_connection_wt4(u32 fib_index, ip4_address_t *lcl, ip4_address_t *rmt, u16 lcl_port, u16 rmt_port, u8 proto, u32 thread_index, u8 *result)
Lookup connection with ip4 and transport layer information.
static tcp_header_t * tcp_buffer_hdr(vlib_buffer_t *b)
Definition: tcp.h:696
vnet_hw_interface_flags_t flags
Definition: interface.h:523
#define vlib_prefetch_buffer_header(b, type)
Prefetch buffer metadata.
Definition: buffer.h:203
static int tcp_segment_validate(tcp_worker_ctx_t *wrk, tcp_connection_t *tc0, vlib_buffer_t *b0, tcp_header_t *th0, u32 *error0)
Validate incoming segment as per RFC793 p.
Definition: tcp_input.c:279
enum _tcp_state tcp_state_t
#define TCP_ALWAYS_ACK
On/off delayed acks.
Definition: tcp.h:39
vlib_node_registration_t tcp6_input_node
(constructor) VLIB_REGISTER_NODE (tcp6_input_node)
Definition: tcp_input.c:3806
static u8 tcp_ack_is_dupack(tcp_connection_t *tc, vlib_buffer_t *b, u32 prev_snd_wnd, u32 prev_snd_una)
Check if duplicate ack as per RFC5681 Sec.
Definition: tcp_input.c:1539
#define TCP_RTO_MAX
Definition: tcp.h:99
static u32 ooo_segment_length(svm_fifo_t *f, ooo_segment_t *s)
Definition: svm_fifo.h:723
static void * ip4_next_header(ip4_header_t *i)
Definition: ip4_packet.h:241
static u32 tcp_time_now(void)
Definition: tcp.h:1006
sack_block_t * sacks
SACK blocks.
Definition: tcp_packet.h:150
unsigned int u32
Definition: types.h:88
#define vec_end(v)
End (last data address) of vector.
#define vlib_call_init_function(vm, x)
Definition: init.h:270
static void tcp_node_inc_counter_i(vlib_main_t *vm, u32 tcp4_node, u32 tcp6_node, u8 is_ip4, u32 evt, u32 val)
Definition: tcp_input.c:2108
#define TCP_MAX_SACK_BLOCKS
Max number of SACK blocks stored.
Definition: tcp.h:163
#define VLIB_FRAME_SIZE
Definition: node.h:378
static void tcp_cc_init_congestion(tcp_connection_t *tc)
Init loss recovery/fast recovery.
Definition: tcp_input.c:1242
#define tcp_validate_txf_size(_tc, _a)
Definition: tcp.h:1218
static int tcp_options_parse(tcp_header_t *th, tcp_options_t *to, u8 is_syn)
Parse TCP header options.
Definition: tcp_input.c:127
#define timestamp_lt(_t1, _t2)
Definition: tcp.h:880
static void tcp_timer_set(tcp_connection_t *tc, u8 timer_id, u32 interval)
Definition: tcp.h:1112
#define TCP_OPTION_LEN_WINDOW_SCALE
Definition: tcp_packet.h:165
static void svm_fifo_newest_ooo_segment_reset(svm_fifo_t *f)
Definition: svm_fifo.h:707
static heap_elt_t * first(heap_header_t *h)
Definition: heap.c:59
void scoreboard_init(sack_scoreboard_t *sb)
Definition: tcp_input.c:919
The identity of a DPO is a combination of its type and its instance number/index of objects of that t...
Definition: dpo.h:170
static u8 tcp_should_fastrecover(tcp_connection_t *tc, u8 has_sack)
Definition: tcp_input.c:1301
vlib_main_t * vm
convenience pointer to this thread's vlib main
Definition: tcp.h:525
#define TCP_INVALID_SACK_HOLE_INDEX
Definition: tcp.h:164
#define pool_elt_at_index(p, i)
Returns pointer to element at given index.
Definition: pool.h:519
static void tcp_program_dequeue(tcp_worker_ctx_t *wrk, tcp_connection_t *tc)
Definition: tcp_input.c:624
void tcp_send_ack(tcp_connection_t *tc)
Definition: tcp_output.c:1157
static void tcp_handle_disconnects(tcp_worker_ctx_t *wrk)
Definition: tcp_input.c:1677
static uword tcp46_listen_inline(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *from_frame, int is_ip4)
LISTEN state processing as per RFC 793 p.
Definition: tcp_input.c:3197
void tcp_connection_tx_pacer_reset(tcp_connection_t *tc, u32 window, u32 start_bucket)
Definition: tcp.c:1409
static void tcp_input_set_error_next(tcp_main_t *tm, u16 *next, u32 *error, u8 is_ip4)
Definition: tcp_input.c:3465
tcp_connection_t * tcp_connection_alloc_w_base(u8 thread_index, tcp_connection_t *base)
Definition: tcp.c:311
static const dpo_id_t * load_balance_get_bucket_i(const load_balance_t *lb, u32 bucket)
Definition: load_balance.h:229
vlib_node_registration_t tcp4_input_nolookup_node
(constructor) VLIB_REGISTER_NODE (tcp4_input_nolookup_node)
Definition: tcp_input.c:3732
unsigned short u16
Definition: types.h:57
#define foreach_tcp4_input_next
Definition: tcp_input.c:3422
tcp_connection_t * tcp_connection_alloc(u8 thread_index)
Definition: tcp.c:298
static void * vlib_buffer_get_current(vlib_buffer_t *b)
Get pointer to current data to process.
Definition: buffer.h:229
#define filter_flags
Definition: tcp_input.c:3440
void tcp_connection_tx_pacer_update(tcp_connection_t *tc)
Definition: tcp.c:1396
#define pool_put(P, E)
Free an object E in pool P.
Definition: pool.h:287
static int tcp_buffer_discard_bytes(vlib_buffer_t *b, u32 n_bytes_to_drop)
Definition: tcp_input.c:1921
static void tcp_check_tx_offload(tcp_connection_t *tc, int is_ipv4)
Definition: tcp_input.c:2421
#define foreach_tcp6_input_next
Definition: tcp_input.c:3431
#define TCP_TIMER_HANDLE_INVALID
Definition: tcp.h:92
The FIB DPO provieds;.
Definition: load_balance.h:106
static void tcp_input_trace_frame(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_buffer_t **bs, u32 n_bufs, u8 is_ip4)
Definition: tcp_input.c:3443
int ip6_address_compare(ip6_address_t *a1, ip6_address_t *a2)
Definition: ip46_cli.c:60
static void tcp_cc_rcv_cong_ack(tcp_connection_t *tc, tcp_cc_ack_t ack_type, tcp_rate_sample_t *rs)
Definition: tcp.h:1060
#define PREDICT_FALSE(x)
Definition: clib.h:111
#define always_inline
Definition: ipsec.h:28
static int tcp_rcv_ack_no_cc(tcp_connection_t *tc, vlib_buffer_t *b, u32 *error)
Definition: tcp_input.c:421
#define vec_del1(v, i)
Delete the element at index I.
Definition: vec.h:806
#define TCP_FLAG_FIN
Definition: fa_node.h:12
static void tcp_cc_handle_event(tcp_connection_t *tc, tcp_rate_sample_t *rs, u32 is_dack)
One function to rule them all ...
Definition: tcp_input.c:1404
vlib_node_registration_t tcp4_listen_node
(constructor) VLIB_REGISTER_NODE (tcp4_listen_node)
Definition: tcp_input.c:3373
#define TCP_OPTION_LEN_TIMESTAMP
Definition: tcp_packet.h:167
vlib_main_t * vm
Definition: in2out_ed.c:1810
static ooo_segment_t * svm_fifo_newest_ooo_segment(svm_fifo_t *f)
Definition: svm_fifo.h:699
u32 tcp_sack_list_bytes(tcp_connection_t *tc)
Definition: tcp_input.c:1792
Selective Ack block.
Definition: tcp_packet.h:109
vlib_node_registration_t tcp6_established_node
(constructor) VLIB_REGISTER_NODE (tcp6_established_node)
Definition: tcp_input.c:2263
sack_scoreboard_hole_t * scoreboard_first_hole(sack_scoreboard_t *sb)
Definition: tcp_input.c:673
static int tcp_can_delack(tcp_connection_t *tc)
Check if ACK could be delayed.
Definition: tcp_input.c:1905
static void vlib_node_increment_counter(vlib_main_t *vm, u32 node_index, u32 counter_index, u64 increment)
Definition: node_funcs.h:1150
static int tcp_cc_recover(tcp_connection_t *tc)
Definition: tcp_input.c:1329
#define TCP_FLAG_RST
Definition: fa_node.h:14
#define TCP_DBG(_fmt, _args...)
Definition: tcp_debug.h:146
static int tcp_rcv_ack(tcp_worker_ctx_t *wrk, tcp_connection_t *tc, vlib_buffer_t *b, tcp_header_t *th, u32 *error)
Process incoming ACK.
Definition: tcp_input.c:1567
#define TCP_MAX_WND_SCALE
Definition: tcp_packet.h:172
void tcp_connection_free(tcp_connection_t *tc)
Definition: tcp.c:324
u8 is_ip4
Definition: lisp_gpe.api:232
u8 ip6[16]
Definition: one.api:477
#define VLIB_REGISTER_NODE(x,...)
Definition: node.h:169
vlib_node_registration_t tcp4_syn_sent_node
(constructor) VLIB_REGISTER_NODE (tcp4_syn_sent_node)
Definition: tcp_input.c:2723
u32 flags
Definition: vhost_user.h:141
u16 n_vectors
Definition: node.h:397
#define CLIB_PREFETCH(addr, size, type)
Definition: cache.h:80
int ip4_address_compare(ip4_address_t *a1, ip4_address_t *a2)
Definition: ip46_cli.c:53
static_always_inline void vlib_buffer_enqueue_to_next(vlib_main_t *vm, vlib_node_runtime_t *node, u32 *buffers, u16 *nexts, uword count)
Definition: buffer_node.h:332
static void tcp_set_rx_trace_data(tcp_rx_trace_t *t0, tcp_connection_t *tc0, tcp_header_t *th0, vlib_buffer_t *b0, u8 is_ip4)
Definition: tcp_input.c:2058
void tcp_program_dupack(tcp_connection_t *tc)
Definition: tcp_output.c:1186
void tcp_send_reset(tcp_connection_t *tc)
Build and set reset packet for connection.
Definition: tcp_output.c:863
#define TCP_DUPACK_THRESHOLD
Definition: tcp.h:37
static u32 tcp_tstamp(tcp_connection_t *tc)
Generate timestamp for tcp connection.
Definition: tcp.h:1021
static tcp_connection_t * tcp_input_lookup_buffer(vlib_buffer_t *b, u8 thread_index, u32 *error, u8 is_ip4, u8 is_nolookup)
Definition: tcp_input.c:3484
format_function_t format_tcp_state
Definition: tcp.h:64
static void tcp_cc_undo_recovery(tcp_connection_t *tc)
Definition: tcp.h:1085
static void scoreboard_update_bytes(sack_scoreboard_t *sb, u32 ack, u32 snd_mss)
Definition: tcp_input.c:772
#define clib_warning(format, args...)
Definition: error.h:59
Don&#39;t register connection in lookup Does not apply to local apps and transports using the network lay...
tcp_header_t tcp_header
Definition: tcp_input.c:2022
format_function_t format_tcp_header
Definition: format.h:100
struct _transport_connection transport_connection_t
f64 rtt_time
RTT for sample.
Definition: tcp.h:282
#define pool_is_free_index(P, I)
Use free bitmap to query whether given index is free.
Definition: pool.h:284
#define ARRAY_LEN(x)
Definition: clib.h:62
#define TCP_RTT_MAX
Definition: tcp.h:101
u16 mss
Maximum segment size advertised.
Definition: tcp_packet.h:147
vlib_main_t vlib_node_runtime_t * node
Definition: in2out_ed.c:1810
static void * ip6_next_header(ip6_header_t *i)
Definition: ip6_packet.h:368
static u32 transport_max_tx_dequeue(transport_connection_t *tc)
Definition: session.h:478
void tcp_send_synack(tcp_connection_t *tc)
Definition: tcp_output.c:954
static void tcp_timer_update(tcp_connection_t *tc, u8 timer_id, u32 interval)
Definition: tcp.h:1136
#define TCP_PAWS_IDLE
24 days
Definition: tcp.h:30
vslo right
#define ASSERT(truth)
#define tcp_syn(_th)
Definition: tcp_packet.h:80
static clib_error_t * tcp_input_init(vlib_main_t *vm)
Definition: tcp_input.c:4057
#define tcp_fastrecovery_first_on(tc)
Definition: tcp.h:471
static void tcp_estimate_rtt(tcp_connection_t *tc, u32 mrtt)
Compute smoothed RTT as per VJ&#39;s &#39;88 SIGCOMM and RFC6298.
Definition: tcp_input.c:454
u8 data[128]
Definition: ipsec_types.api:87
static int tcp_update_rtt(tcp_connection_t *tc, tcp_rate_sample_t *rs, u32 ack)
Update RTT estimate and RTO timer.
Definition: tcp_input.c:497
enum _tcp_rcv_process_next tcp_rcv_process_next_t
static load_balance_t * load_balance_get(index_t lbi)
Definition: load_balance.h:220
#define seq_geq(_s1, _s2)
Definition: tcp.h:876
IPv4 main type.
Definition: ip4.h:105
static void tcp_cc_update(tcp_connection_t *tc, tcp_rate_sample_t *rs)
Definition: tcp_input.c:1380
static void tcp_handle_postponed_dequeues(tcp_worker_ctx_t *wrk)
Dequeue bytes for connections that have received acks in last burst.
Definition: tcp_input.c:582
void tcp_bt_sample_delivery_rate(tcp_connection_t *tc, tcp_rate_sample_t *rs)
Generate a delivery rate sample from recently acked bytes.
Definition: tcp_bt.c:582
static index_t ip4_fib_forwarding_lookup(u32 fib_index, const ip4_address_t *addr)
Definition: ip4_fib.h:160
static void tcp_estimate_initial_rtt(tcp_connection_t *tc)
Definition: tcp_input.c:551
static void vlib_buffer_advance(vlib_buffer_t *b, word l)
Advance current data pointer by the supplied (signed!) amount.
Definition: buffer.h:248
static int tcp_segment_check_paws(tcp_connection_t *tc)
RFC1323: Check against wrapped sequence numbers (PAWS).
Definition: tcp_input.c:241
static uword ip6_address_is_link_local_unicast(const ip6_address_t *a)
Definition: ip6_packet.h:250
static u8 tcp_cc_is_spurious_timeout_rxt(tcp_connection_t *tc)
Definition: tcp_input.c:1278
static void tcp_established_trace_frame(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame, u8 is_ip4)
Definition: tcp_input.c:2074
enum _tcp_input_next tcp_input_next_t
static void scoreboard_update_sacked_rxt(sack_scoreboard_t *sb, u32 start, u32 end, u8 has_rxt)
Definition: tcp_input.c:761
void tcp_update_sack_list(tcp_connection_t *tc, u32 start, u32 end)
Build SACK list as per RFC2018.
Definition: tcp_input.c:1743
#define tcp_fastrecovery_first_off(tc)
Definition: tcp.h:472
int session_stream_accept_notify(transport_connection_t *tc)
Definition: session.c:996
Out-of-order segment.
Definition: svm_fifo.h:29
static u8 tcp_segment_in_rcv_wnd(tcp_connection_t *tc, u32 seq, u32 end_seq)
Validate segment sequence number.
Definition: tcp_input.c:112
#define clib_max(x, y)
Definition: clib.h:288
static vlib_main_t * vlib_get_main(void)
Definition: global_funcs.h:23
static u32 tcp_time_now_w_thread(u32 thread_index)
Definition: tcp.h:1012
static clib_error_t * tcp_init(vlib_main_t *vm)
Definition: tcp.c:1695
static void * vlib_add_trace(vlib_main_t *vm, vlib_node_runtime_t *r, vlib_buffer_t *b, u32 n_data_bytes)
Definition: trace_funcs.h:55
#define vec_elt(v, i)
Get vector value at index i.
u8 ip_is_zero(ip46_address_t *ip46_address, u8 is_ip4)
Definition: ip.c:20
#define seq_lt(_s1, _s2)
Definition: tcp.h:873
#define tcp_is_syn(_th)
Definition: tcp_packet.h:89
#define tcp_opts_wscale(_to)
Definition: tcp_packet.h:157
enum _tcp_syn_sent_next tcp_syn_sent_next_t
void tcp_send_reset_w_pkt(tcp_connection_t *tc, vlib_buffer_t *pkt, u32 thread_index, u8 is_ip4)
Send reset without reusing existing buffer.
Definition: tcp_output.c:778
static void tcp_update_snd_wnd(tcp_connection_t *tc, u32 seq, u32 ack, u32 snd_wnd)
Try to update snd_wnd based on feedback received from peer.
Definition: tcp_input.c:1204
void tcp_connection_reset(tcp_connection_t *tc)
Notify session that connection has been reset.
Definition: tcp.c:342
u32 tsval
Timestamp value.
Definition: tcp_packet.h:148
enum _tcp_established_next tcp_established_next_t
u16 payload_length
Definition: ip6_packet.h:298
u32 tsecr
Echoed/reflected time stamp.
Definition: tcp_packet.h:149
vlib_node_registration_t tcp4_input_node
(constructor) VLIB_REGISTER_NODE (tcp4_input_node)
Definition: tcp_input.c:3786
void tcp_send_fin(tcp_connection_t *tc)
Send FIN.
Definition: tcp_output.c:1007
#define vec_len(v)
Number of elements in vector (rvalue-only, NULL tolerant)
enum _tcp_listen_next tcp_listen_next_t
#define foreach_tcp_state_next
Definition: tcp_input.c:31
u32 next_buffer
Next buffer for this linked-list of buffers.
Definition: buffer.h:140
static u8 tcp_is_lost_fin(tcp_connection_t *tc)
Definition: tcp.h:989
static u32 scoreboard_hole_bytes(sack_scoreboard_hole_t *hole)
Definition: tcp_input.c:643
static void tcp_cc_rcv_ack(tcp_connection_t *tc, tcp_rate_sample_t *rs)
Definition: tcp.h:1053
static tcp_worker_ctx_t * tcp_get_worker(u32 thread_index)
Definition: tcp.h:690
void session_transport_closed_notify(transport_connection_t *tc)
Notification from transport that it is closed.
Definition: session.c:946
static void tcp_retransmit_timer_update(tcp_connection_t *tc)
Definition: tcp.h:1199
VLIB buffer representation.
Definition: buffer.h:102
static int tcp_session_enqueue_data(tcp_connection_t *tc, vlib_buffer_t *b, u16 data_len)
Enqueue data for delivery to application.
Definition: tcp_input.c:1803
static u8 tcp_should_fastrecover_sack(tcp_connection_t *tc)
Definition: tcp_input.c:1293
u64 uword
Definition: types.h:112
#define seq_max(_s1, _s2)
Definition: tcp.h:877
sack_scoreboard_hole_t * scoreboard_next_hole(sack_scoreboard_t *sb, sack_scoreboard_hole_t *hole)
Definition: tcp_input.c:657
sack_scoreboard_hole_t * scoreboard_prev_hole(sack_scoreboard_t *sb, sack_scoreboard_hole_t *hole)
Definition: tcp_input.c:665
static void * vlib_frame_vector_args(vlib_frame_t *f)
Get pointer to frame vector data.
Definition: node_funcs.h:244
void tcp_connection_init_vars(tcp_connection_t *tc)
Initialize tcp connection variables.
Definition: tcp.c:726
static void tcp_cc_recovered(tcp_connection_t *tc)
Definition: tcp.h:1079
static void scoreboard_remove_hole(sack_scoreboard_t *sb, sack_scoreboard_hole_t *hole)
Definition: tcp_input.c:689
sack_scoreboard_hole_t * scoreboard_next_rxt_hole(sack_scoreboard_t *sb, sack_scoreboard_hole_t *start, u8 have_unsent, u8 *can_rescue, u8 *snd_limited)
Figure out the next hole to retransmit.
Definition: tcp_input.c:845
#define TCP_OPTION_LEN_MSS
Definition: tcp_packet.h:164
sack_scoreboard_hole_t * scoreboard_last_hole(sack_scoreboard_t *sb)
Definition: tcp_input.c:681
session_t * session_lookup_listener6(u32 fib_index, ip6_address_t *lcl, u16 lcl_port, u8 proto, u8 use_wildcard)
#define tcp_disconnect_pending(tc)
Definition: tcp.h:467
left
#define TCP_RTO_MIN
Definition: tcp.h:100
static tcp_connection_t * tcp_lookup_listener(vlib_buffer_t *b, u32 fib_index, int is_ip4)
Definition: tcp_input.c:2391
static u32 ooo_segment_offset_prod(svm_fifo_t *f, ooo_segment_t *s)
Definition: svm_fifo.h:713
struct clib_bihash_value offset
template key/value backing page structure
#define tcp_scoreboard_trace_add(_tc, _ack)
Definition: tcp.h:229
#define vnet_buffer(b)
Definition: buffer.h:408
static tcp_connection_t * tcp_connection_get(u32 conn_index, u32 thread_index)
Definition: tcp.h:717
static u32 scoreboard_hole_index(sack_scoreboard_t *sb, sack_scoreboard_hole_t *hole)
Definition: tcp_input.c:636
static u8 tcp_lookup_is_valid(tcp_connection_t *tc, vlib_buffer_t *b, tcp_header_t *hdr)
Definition: tcp_input.c:2283
ip4_main_t ip4_main
Global ip4 main structure.
Definition: ip4_forward.c:1079
static int tcp_header_bytes(tcp_header_t *t)
Definition: tcp_packet.h:93
int session_stream_connect_notify(transport_connection_t *tc, u8 is_fail)
Definition: session.c:757
vl_api_dhcp_client_state_t state
Definition: dhcp.api:201
#define tcp_disconnect_pending_off(tc)
Definition: tcp.h:469
static u32 vlib_num_workers()
Definition: threads.h:372
void tcp_connection_cleanup(tcp_connection_t *tc)
Cleans up connection state.
Definition: tcp.c:238
void tcp_connection_del(tcp_connection_t *tc)
Connection removal.
Definition: tcp.c:291
f64 end
end of the time range
Definition: mactime.api:44
vlib_main_t vlib_node_runtime_t vlib_frame_t * frame
Definition: in2out_ed.c:1811
u16 flags
Copy of main node flags.
Definition: node.h:509
Window scale.
Definition: tcp_packet.h:107
u32 session_tx_fifo_dequeue_drop(transport_connection_t *tc, u32 max_bytes)
Definition: session.c:511
void tcp_program_ack(tcp_connection_t *tc)
Definition: tcp_output.c:1176
vlib_node_registration_t tcp6_listen_node
(constructor) VLIB_REGISTER_NODE (tcp6_listen_node)
Definition: tcp_input.c:3392
#define tcp_opts_sack_permitted(_to)
Definition: tcp_packet.h:159
static int ip4_header_bytes(const ip4_header_t *i)
Definition: ip4_packet.h:235
u32 ip4
Definition: one.api:440
Timestamps.
Definition: tcp_packet.h:110
int session_stream_accept(transport_connection_t *tc, u32 listener_index, u32 thread_index, u8 notify)
Accept a stream session.
Definition: session.c:1013
static_always_inline void vlib_get_buffers(vlib_main_t *vm, u32 *bi, vlib_buffer_t **b, int count)
Translate array of buffer indices into buffer pointers.
Definition: buffer_funcs.h:244
#define VLIB_NODE_FLAG_TRACE
Definition: node.h:302
tcp_bts_flags_t flags
Rate sample flags from bt sample.
Definition: tcp.h:286
#define CLIB_CACHE_LINE_BYTES
Definition: cache.h:59
static transport_connection_t * transport_get_listener(transport_proto_t tp, u32 conn_index)
Definition: transport.h:126
u32 total_length_not_including_first_buffer
Only valid for first buffer in chain.
Definition: buffer.h:167
static uword tcp46_input_inline(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame, int is_ip4, u8 is_nolookup)
Definition: tcp_input.c:3607
static void tcp_persist_timer_set(tcp_connection_t *tc)
Definition: tcp.h:1172
static tcp_main_t * vnet_get_tcp_main()
Definition: tcp.h:684
static uword tcp46_syn_sent_inline(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *from_frame, int is_ip4)
Definition: tcp_input.c:2455
#define tcp_fastrecovery_off(tc)
Definition: tcp.h:461
static uword tcp46_rcv_process_inline(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *from_frame, int is_ip4)
Handles reception for all states except LISTEN, SYN-SENT and ESTABLISHED as per RFC793 p...
Definition: tcp_input.c:2765
session_t * session_lookup_listener4(u32 fib_index, ip4_address_t *lcl, u16 lcl_port, u8 proto, u8 use_wildcard)
static void tcp_retransmit_timer_reset(tcp_connection_t *tc)
Definition: tcp.h:1159
static vlib_buffer_t * vlib_get_buffer(vlib_main_t *vm, u32 buffer_index)
Translate buffer index into buffer pointer.
Definition: buffer_funcs.h:85
static void tcp_input_dispatch_buffer(tcp_main_t *tm, tcp_connection_t *tc, vlib_buffer_t *b, u16 *next, u32 *error)
Definition: tcp_input.c:3580
vlib_node_registration_t tcp6_input_nolookup_node
(constructor) VLIB_REGISTER_NODE (tcp6_input_nolookup_node)
Definition: tcp_input.c:3752
static u32 tcp_set_time_now(tcp_worker_ctx_t *wrk)
Definition: tcp.h:1034
static void tcp_handle_old_ack(tcp_connection_t *tc, tcp_rate_sample_t *rs)
Definition: tcp_input.c:1519
#define tcp_ack(_th)
Definition: tcp_packet.h:83
static u32 transport_tx_fifo_size(transport_connection_t *tc)
Definition: session.h:499
static u8 tcp_timer_is_active(tcp_connection_t *tc, tcp_timers_e timer)
Definition: tcp.h:1213
transport_connection_t * session_lookup_half_open_connection(u64 handle, u8 proto, u8 is_ip4)
Definition: defs.h:46
static tcp_connection_t * tcp_listener_get(u32 tli)
Definition: tcp.h:768
static void tcp_cc_congestion(tcp_connection_t *tc)
Definition: tcp.h:1067
ip6_address_t dst_address
Definition: ip6_packet.h:307
static u8 tcp_ack_is_cc_event(tcp_connection_t *tc, vlib_buffer_t *b, u32 prev_snd_wnd, u32 prev_snd_una, u8 *is_dack)
Checks if ack is a congestion control event.
Definition: tcp_input.c:1552
static void tcp_persist_timer_reset(tcp_connection_t *tc)
Definition: tcp.h:1193
static char * tcp_error_strings[]
Definition: tcp_input.c:24
#define TCP_EVT(_evt, _args...)
Definition: tcp_debug.h:145
static uword pool_elts(void *v)
Number of active elements in a pool.
Definition: pool.h:128