FD.io VPP  v19.08.1-401-g8e4ed521a
Vector Packet Processing
tcp_input.c
1 /*
2  * Copyright (c) 2016-2019 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include <vppinfra/sparse_vec.h>
17 #include <vnet/fib/ip4_fib.h>
18 #include <vnet/fib/ip6_fib.h>
19 #include <vnet/tcp/tcp_packet.h>
20 #include <vnet/tcp/tcp.h>
21 #include <vnet/session/session.h>
22 #include <math.h>
23 
24 static char *tcp_error_strings[] = {
25 #define tcp_error(n,s) s,
26 #include <vnet/tcp/tcp_error.def>
27 #undef tcp_error
28 };
29 
30 /* All TCP nodes have the same outgoing arcs */
31 #define foreach_tcp_state_next \
32  _ (DROP4, "ip4-drop") \
33  _ (DROP6, "ip6-drop") \
34  _ (TCP4_OUTPUT, "tcp4-output") \
35  _ (TCP6_OUTPUT, "tcp6-output")
36 
37 typedef enum _tcp_established_next
38 {
39 #define _(s,n) TCP_ESTABLISHED_NEXT_##s,
40  foreach_tcp_state_next
41 #undef _
42  TCP_ESTABLISHED_N_NEXT,
43 } tcp_established_next_t;
44 
45 typedef enum _tcp_rcv_process_next
46 {
47 #define _(s,n) TCP_RCV_PROCESS_NEXT_##s,
48  foreach_tcp_state_next
49 #undef _
50  TCP_RCV_PROCESS_N_NEXT,
51 } tcp_rcv_process_next_t;
52 
53 typedef enum _tcp_syn_sent_next
54 {
55 #define _(s,n) TCP_SYN_SENT_NEXT_##s,
56  foreach_tcp_state_next
57 #undef _
58  TCP_SYN_SENT_N_NEXT,
59 } tcp_syn_sent_next_t;
60 
61 typedef enum _tcp_listen_next
62 {
63 #define _(s,n) TCP_LISTEN_NEXT_##s,
64  foreach_tcp_state_next
65 #undef _
66  TCP_LISTEN_N_NEXT,
67 } tcp_listen_next_t;
68 
69 /* Generic, state independent indices */
70 typedef enum _tcp_state_next
71 {
72 #define _(s,n) TCP_NEXT_##s,
73  foreach_tcp_state_next
74 #undef _
75  TCP_STATE_N_NEXT,
76 } tcp_state_next_t;
77 
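For reference, each enum above is produced by expanding foreach_tcp_state_next with the temporary _ macro. A sketch of what the preprocessor generates for the generic enum (illustrative, not part of the file):

typedef enum _tcp_state_next
{
  TCP_NEXT_DROP4,	/* from _ (DROP4, "ip4-drop") */
  TCP_NEXT_DROP6,	/* from _ (DROP6, "ip6-drop") */
  TCP_NEXT_TCP4_OUTPUT,	/* from _ (TCP4_OUTPUT, "tcp4-output") */
  TCP_NEXT_TCP6_OUTPUT,	/* from _ (TCP6_OUTPUT, "tcp6-output") */
  TCP_STATE_N_NEXT,
} tcp_state_next_t;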
78 #define tcp_next_output(is_ip4) (is_ip4 ? TCP_NEXT_TCP4_OUTPUT \
79  : TCP_NEXT_TCP6_OUTPUT)
80 
81 #define tcp_next_drop(is_ip4) (is_ip4 ? TCP_NEXT_DROP4 \
82  : TCP_NEXT_DROP6)
83 
84 /**
85  * Validate segment sequence number. As per RFC793:
86  *
87  * Segment Receive  Test
88  * Length  Window
89  * ------- -------  -------------------------------------------
90  * 0       0        SEG.SEQ = RCV.NXT
91  * 0       >0       RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND
92  * >0      0        not acceptable
93  * >0      >0       RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND
94  *                  or RCV.NXT =< SEG.SEQ+SEG.LEN-1 < RCV.NXT+RCV.WND
95  *
96  * This ultimately comes down to checking if the segment falls within the
97  * window. The one important difference compared to RFC793 is that we use
98  * rcv_las, i.e., rcv_nxt at the last ack sent, instead of rcv_nxt, since
99  * that is the peer's reference when computing our receive window.
100  *
101  * This:
102  * seq_leq (end_seq, tc->rcv_las + tc->rcv_wnd) && seq_geq (seq, tc->rcv_las)
103  * however, is too strict when we have retransmits. Instead we just check that
104  * the seq is not beyond the right edge and that the end of the segment is not
105  * less than the left edge.
106  *
107  * N.B. rcv_nxt and rcv_wnd are both updated in this node if acks are sent, so
108  * use rcv_nxt in the right edge window test instead of rcv_las.
109  *
110  */
111 always_inline u8
112 tcp_segment_in_rcv_wnd (tcp_connection_t * tc, u32 seq, u32 end_seq)
113 {
114  return (seq_geq (end_seq, tc->rcv_las)
115  && seq_leq (seq, tc->rcv_nxt + tc->rcv_wnd));
116 }
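The seq_geq/seq_leq helpers used above compare 32-bit sequence numbers modulo 2^32. A minimal sketch of the comparison they are built on (mirrors the seq_* macros in tcp_packet.h; illustrative):

static inline int
seq_lt_sketch (u32 s1, u32 s2)
{
  /* Wraparound-safe: reinterpret the unsigned difference as signed,
   * so 0x00000100 still compares greater than 0xffffff00 */
  return (i32) (s1 - s2) < 0;
}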
117 
118 /**
119  * Parse TCP header options.
120  *
121  * @param th TCP header
122  * @param to TCP options data structure to be populated
123  * @param is_syn set if packet is syn
124  * @return -1 if parsing failed
125  */
126 static inline int
127 tcp_options_parse (tcp_header_t * th, tcp_options_t * to, u8 is_syn)
128 {
129  const u8 *data;
130  u8 opt_len, opts_len, kind;
131  int j;
132  sack_block_t b;
133 
134  opts_len = (tcp_doff (th) << 2) - sizeof (tcp_header_t);
135  data = (const u8 *) (th + 1);
136 
137  /* Zero out all flags but those set in SYN */
138  to->flags &= (TCP_OPTS_FLAG_SACK_PERMITTED | TCP_OPTS_FLAG_WSCALE
139  | TCP_OPTS_FLAG_TSTAMP | TCP_OPTS_FLAG_MSS);
140 
141  for (; opts_len > 0; opts_len -= opt_len, data += opt_len)
142  {
143  kind = data[0];
144 
145  /* Get options length */
146  if (kind == TCP_OPTION_EOL)
147  break;
148  else if (kind == TCP_OPTION_NOOP)
149  {
150  opt_len = 1;
151  continue;
152  }
153  else
154  {
155  /* broken options */
156  if (opts_len < 2)
157  return -1;
158  opt_len = data[1];
159 
160  /* weird option length */
161  if (opt_len < 2 || opt_len > opts_len)
162  return -1;
163  }
164 
165  /* Parse options */
166  switch (kind)
167  {
168  case TCP_OPTION_MSS:
169  if (!is_syn)
170  break;
171  if ((opt_len == TCP_OPTION_LEN_MSS) && tcp_syn (th))
172  {
173  to->flags |= TCP_OPTS_FLAG_MSS;
174  to->mss = clib_net_to_host_u16 (*(u16 *) (data + 2));
175  }
176  break;
177  case TCP_OPTION_WINDOW_SCALE:
178  if (!is_syn)
179  break;
180  if ((opt_len == TCP_OPTION_LEN_WINDOW_SCALE) && tcp_syn (th))
181  {
182  to->flags |= TCP_OPTS_FLAG_WSCALE;
183  to->wscale = data[2];
184  if (to->wscale > TCP_MAX_WND_SCALE)
185  to->wscale = TCP_MAX_WND_SCALE;
186  }
187  break;
188  case TCP_OPTION_TIMESTAMP:
189  if (is_syn)
190  to->flags |= TCP_OPTS_FLAG_TSTAMP;
191  if ((to->flags & TCP_OPTS_FLAG_TSTAMP)
192  && opt_len == TCP_OPTION_LEN_TIMESTAMP)
193  {
194  to->tsval = clib_net_to_host_u32 (*(u32 *) (data + 2));
195  to->tsecr = clib_net_to_host_u32 (*(u32 *) (data + 6));
196  }
197  break;
198  case TCP_OPTION_SACK_PERMITTED:
199  if (!is_syn)
200  break;
201  if (opt_len == TCP_OPTION_LEN_SACK_PERMITTED && tcp_syn (th))
202  to->flags |= TCP_OPTS_FLAG_SACK_PERMITTED;
203  break;
204  case TCP_OPTION_SACK_BLOCK:
205  /* If SACK permitted was not advertised or a SYN, break */
206  if ((to->flags & TCP_OPTS_FLAG_SACK_PERMITTED) == 0 || tcp_syn (th))
207  break;
208 
209  /* If too short or not correctly formatted, break */
210  if (opt_len < 10 || ((opt_len - 2) % TCP_OPTION_LEN_SACK_BLOCK))
211  break;
212 
213  to->flags |= TCP_OPTS_FLAG_SACK;
214  to->n_sack_blocks = (opt_len - 2) / TCP_OPTION_LEN_SACK_BLOCK;
215  vec_reset_length (to->sacks);
216  for (j = 0; j < to->n_sack_blocks; j++)
217  {
218  b.start = clib_net_to_host_u32 (*(u32 *) (data + 2 + 8 * j));
219  b.end = clib_net_to_host_u32 (*(u32 *) (data + 6 + 8 * j));
220  vec_add1 (to->sacks, b);
221  }
222  break;
223  default:
224  /* Nothing to see here */
225  continue;
226  }
227  }
228  return 0;
229 }
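A minimal sketch of driving the parser on a SYN (hypothetical caller and consumer; the real call sites are tcp_segment_validate below and the syn-sent/listen paths):

tcp_options_t opts = { 0 };
if (tcp_options_parse (th, &opts, 1 /* is_syn */ ) < 0)
  return -1;			/* malformed option length */
if (tcp_opts_mss (&opts))
  process_mss (opts.mss);	/* hypothetical consumer; wscale, tstamp and
				   sack_permitted are latched the same way */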
230 
231 /**
232  * RFC1323: Check against wrapped sequence numbers (PAWS). If we have
233  * timestamp to echo and it's less than tsval_recent, drop segment
234  * but still send an ACK in order to retain TCP's mechanism for detecting
235  * and recovering from half-open connections
236  *
237  * Or at least that's what the theory says. It seems that this might not work
238  * very well with packet reordering and fast retransmit. XXX
239  */
240 always_inline int
241 tcp_segment_check_paws (tcp_connection_t * tc)
242 {
243  return tcp_opts_tstamp (&tc->rcv_opts)
244  && timestamp_lt (tc->rcv_opts.tsval, tc->tsval_recent);
245 }
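Worked example of the test above (illustrative numbers):

/* tc->tsval_recent = 1000 and a segment arrives with rcv_opts.tsval = 900.
 * timestamp_lt (900, 1000) holds, so the segment fails PAWS;
 * tcp_segment_validate () then acks it (unless it is an RST, per RFC 7323)
 * and drops it. */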
246 
247 /**
248  * Update tsval recent
249  */
250 always_inline void
251 tcp_update_timestamp (tcp_connection_t * tc, u32 seq, u32 seq_end)
252 {
253  /*
254  * RFC1323: If Last.ACK.sent falls within the range of sequence numbers
255  * of an incoming segment:
256  * SEG.SEQ <= Last.ACK.sent < SEG.SEQ + SEG.LEN
257  * then the TSval from the segment is copied to TS.Recent;
258  * otherwise, the TSval is ignored.
259  */
260  if (tcp_opts_tstamp (&tc->rcv_opts) && seq_leq (seq, tc->rcv_las)
261  && seq_leq (tc->rcv_las, seq_end))
262  {
263  ASSERT (timestamp_leq (tc->tsval_recent, tc->rcv_opts.tsval));
264  tc->tsval_recent = tc->rcv_opts.tsval;
265  tc->tsval_recent_age = tcp_time_now_w_thread (tc->c_thread_index);
266  }
267 }
268 
269 /**
270  * Validate incoming segment as per RFC793 p. 69 and RFC1323 p. 19
271  *
272  * It first verifies if the segment has a wrapped sequence number (PAWS) and
273  * then does the processing associated with the first four steps (ignoring security
274  * and precedence): sequence number, rst bit and syn bit checks.
275  *
276  * @return 0 if segments passes validation.
277  */
278 static int
279 tcp_segment_validate (tcp_worker_ctx_t * wrk, tcp_connection_t * tc0,
280  vlib_buffer_t * b0, tcp_header_t * th0, u32 * error0)
281 {
282  /* We could get a burst of RSTs interleaved with acks */
283  if (PREDICT_FALSE (tc0->state == TCP_STATE_CLOSED))
284  {
285  tcp_send_reset (tc0);
286  *error0 = TCP_ERROR_CONNECTION_CLOSED;
287  goto error;
288  }
289 
290  if (PREDICT_FALSE (!tcp_ack (th0) && !tcp_rst (th0) && !tcp_syn (th0)))
291  {
292  *error0 = TCP_ERROR_SEGMENT_INVALID;
293  goto error;
294  }
295 
296  if (PREDICT_FALSE (tcp_options_parse (th0, &tc0->rcv_opts, 0)))
297  {
298  *error0 = TCP_ERROR_OPTIONS;
299  goto error;
300  }
301 
302  if (PREDICT_FALSE (tcp_segment_check_paws (tc0)))
303  {
304  *error0 = TCP_ERROR_PAWS;
305  TCP_EVT (TCP_EVT_PAWS_FAIL, tc0, vnet_buffer (b0)->tcp.seq_number,
306  vnet_buffer (b0)->tcp.seq_end);
307 
308  /* If it just so happens that a segment updates tsval_recent for a
309  * segment over 24 days old, invalidate tsval_recent. */
310  if (timestamp_lt (tc0->tsval_recent_age + TCP_PAWS_IDLE,
311  tcp_time_now_w_thread (tc0->c_thread_index)))
312  {
313  tc0->tsval_recent = tc0->rcv_opts.tsval;
314  clib_warning ("paws failed: 24-day old segment");
315  }
316  /* Drop after ack if not rst. Resets can fail paws check as per
317  * RFC 7323 sec. 5.2: When an <RST> segment is received, it MUST NOT
318  * be subjected to the PAWS check by verifying an acceptable value in
319  * SEG.TSval */
320  else if (!tcp_rst (th0))
321  {
322  tcp_program_ack (tc0);
323  TCP_EVT (TCP_EVT_DUPACK_SENT, tc0, vnet_buffer (b0)->tcp);
324  goto error;
325  }
326  }
327 
328  /* 1st: check sequence number */
329  if (!tcp_segment_in_rcv_wnd (tc0, vnet_buffer (b0)->tcp.seq_number,
330  vnet_buffer (b0)->tcp.seq_end))
331  {
332  /* SYN/SYN-ACK retransmit */
333  if (tcp_syn (th0)
334  && vnet_buffer (b0)->tcp.seq_number == tc0->rcv_nxt - 1)
335  {
336  tcp_options_parse (th0, &tc0->rcv_opts, 1);
337  if (tc0->state == TCP_STATE_SYN_RCVD)
338  {
339  tcp_send_synack (tc0);
340  TCP_EVT (TCP_EVT_SYN_RCVD, tc0, 0);
341  *error0 = TCP_ERROR_SYNS_RCVD;
342  }
343  else
344  {
345  tcp_program_ack (tc0);
346  TCP_EVT (TCP_EVT_SYNACK_RCVD, tc0);
347  *error0 = TCP_ERROR_SYN_ACKS_RCVD;
348  }
349  goto error;
350  }
351 
352  /* If our window is 0 and the packet is in sequence, let it pass
353  * through for ack processing. It should be dropped later. */
354  if (tc0->rcv_wnd < tc0->snd_mss
355  && tc0->rcv_nxt == vnet_buffer (b0)->tcp.seq_number)
356  goto check_reset;
357 
358  /* If we entered recovery and peer did so as well, there's a chance that
359  * dup acks won't be acceptable on either end because seq_end may be less
360  * than rcv_las. This can happen if acks are lost in both directions. */
361  if (tcp_in_recovery (tc0)
362  && seq_geq (vnet_buffer (b0)->tcp.seq_number,
363  tc0->rcv_las - tc0->rcv_wnd)
364  && seq_leq (vnet_buffer (b0)->tcp.seq_end,
365  tc0->rcv_nxt + tc0->rcv_wnd))
366  goto check_reset;
367 
368  *error0 = TCP_ERROR_RCV_WND;
369 
370  /* If we advertised a zero rcv_wnd and the segment is in the past or the
371  * next one that we expect, it is probably a window probe */
372  if ((tc0->flags & TCP_CONN_ZERO_RWND_SENT)
373  && seq_lt (vnet_buffer (b0)->tcp.seq_end,
374  tc0->rcv_las + tc0->rcv_opts.mss))
375  *error0 = TCP_ERROR_ZERO_RWND;
376 
377  tc0->errors.below_data_wnd += seq_lt (vnet_buffer (b0)->tcp.seq_end,
378  tc0->rcv_las);
379 
380  /* If not RST, send dup ack */
381  if (!tcp_rst (th0))
382  {
383  tcp_program_dupack (tc0);
384  TCP_EVT (TCP_EVT_DUPACK_SENT, tc0, vnet_buffer (b0)->tcp);
385  }
386  goto error;
387 
388  check_reset:
389  ;
390  }
391 
392  /* 2nd: check the RST bit */
393  if (PREDICT_FALSE (tcp_rst (th0)))
394  {
395  tcp_connection_reset (tc0);
396  *error0 = TCP_ERROR_RST_RCVD;
397  goto error;
398  }
399 
400  /* 3rd: check security and precedence (skip) */
401 
402  /* 4th: check the SYN bit (in window) */
403  if (PREDICT_FALSE (tcp_syn (th0)))
404  {
405  /* As per RFC5961 send challenge ack instead of reset */
406  tcp_program_ack (tc0);
407  *error0 = TCP_ERROR_SPURIOUS_SYN;
408  goto error;
409  }
410 
411  /* If segment in window, save timestamp */
412  tcp_update_timestamp (tc0, vnet_buffer (b0)->tcp.seq_number,
413  vnet_buffer (b0)->tcp.seq_end);
414  return 0;
415 
416 error:
417  return -1;
418 }
419 
420 always_inline int
421 tcp_rcv_ack_no_cc (tcp_connection_t * tc, vlib_buffer_t * b, u32 * error)
422 {
423  /* SND.UNA =< SEG.ACK =< SND.NXT */
424  if (!(seq_leq (tc->snd_una, vnet_buffer (b)->tcp.ack_number)
425  && seq_leq (vnet_buffer (b)->tcp.ack_number, tc->snd_nxt)))
426  {
427  if (seq_leq (vnet_buffer (b)->tcp.ack_number, tc->snd_una_max)
428  && seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_una))
429  {
430  tc->snd_nxt = vnet_buffer (b)->tcp.ack_number;
431  goto acceptable;
432  }
433  *error = TCP_ERROR_ACK_INVALID;
434  return -1;
435  }
436 
437 acceptable:
438  tc->bytes_acked = vnet_buffer (b)->tcp.ack_number - tc->snd_una;
439  tc->snd_una = vnet_buffer (b)->tcp.ack_number;
440  *error = TCP_ERROR_ACK_OK;
441  return 0;
442 }
443 
444 /**
445  * Compute smoothed RTT as per VJ's '88 SIGCOMM and RFC6298
446  *
447  * Note that although in the original article srtt and rttvar are scaled
448  * to minimize round-off errors, here we don't scale them. Instead, we
449  * rely on higher precision time measurements.
450  *
451  * TODO support us rtt resolution
452  */
453 static void
454 tcp_estimate_rtt (tcp_connection_t * tc, u32 mrtt)
455 {
456  int err, diff;
457 
458  if (tc->srtt != 0)
459  {
460  err = mrtt - tc->srtt;
461 
462  /* XXX Drop in RTT results in RTTVAR increase and bigger RTO.
463  * The increase should be bound */
464  tc->srtt = clib_max ((int) tc->srtt + (err >> 3), 1);
465  diff = (clib_abs (err) - (int) tc->rttvar) >> 2;
466  tc->rttvar = clib_max ((int) tc->rttvar + diff, 1);
467  }
468  else
469  {
470  /* First measurement. */
471  tc->srtt = mrtt;
472  tc->rttvar = mrtt >> 1;
473  }
474 }
475 
476 #ifndef CLIB_MARCH_VARIANT
477 void
478 tcp_update_rto (tcp_connection_t * tc)
479 {
480  tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX);
481  tc->rto = clib_max (tc->rto, TCP_RTO_MIN);
482 }
483 #endif /* CLIB_MARCH_VARIANT */
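Worked example of the estimator and RTO update above (tick units, illustrative numbers):

/* First sample, mrtt = 8:  srtt = 8, rttvar = 8 >> 1 = 4,
 * rto = clib_min (8 + (4 << 2), TCP_RTO_MAX) = 24 ticks.
 * Next sample, mrtt = 16: err = 16 - 8 = 8, srtt = 8 + (8 >> 3) = 9,
 * diff = (8 - 4) >> 2 = 1, rttvar = 4 + 1 = 5,
 * rto = 9 + (5 << 2) = 29 ticks (clamped below by TCP_RTO_MIN). */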
484 
485 /**
486  * Update RTT estimate and RTO timer
487  *
488  * Measure RTT: We have two sources of RTT measurements: TSOPT and ACK
489  * timing. Middle boxes are known to fiddle with TCP options so we
490  * should give higher priority to ACK timing.
491  *
492  * This should be called only if previously sent bytes have been acked.
493  *
494  * @return 1 if valid rtt, 0 otherwise
495  */
496 static int
497 tcp_update_rtt (tcp_connection_t * tc, tcp_rate_sample_t * rs, u32 ack)
498 {
499  u32 mrtt = 0;
500 
501  /* Karn's rule, part 1. Don't use retransmitted segments to estimate
502  * RTT because they're ambiguous. */
503  if (tcp_in_cong_recovery (tc))
504  {
505  /* Accept rtt estimates for samples that have not been retransmitted */
506  if ((tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE)
507  && !(rs->flags & TCP_BTS_IS_RXT))
508  {
509  mrtt = rs->rtt_time * THZ;
510  goto estimate_rtt;
511  }
512  goto done;
513  }
514 
515  if (tc->rtt_ts && seq_geq (ack, tc->rtt_seq))
516  {
517  f64 sample = tcp_time_now_us (tc->c_thread_index) - tc->rtt_ts;
518  tc->mrtt_us = tc->mrtt_us + (sample - tc->mrtt_us) * 0.125;
519  mrtt = clib_max ((u32) (sample * THZ), 1);
520  /* Allow measuring of a new RTT */
521  tc->rtt_ts = 0;
522  }
523  /* As per RFC7323 TSecr can be used for RTTM only if the segment advances
524  * snd_una, i.e., the left side of the send window:
525  * seq_lt (tc->snd_una, ack). This is a condition for calling update_rtt */
526  else if (tcp_opts_tstamp (&tc->rcv_opts) && tc->rcv_opts.tsecr)
527  {
528  u32 now = tcp_tstamp (tc);
529  mrtt = clib_max (now - tc->rcv_opts.tsecr, 1);
530  }
531 
532 estimate_rtt:
533 
534  /* Ignore dubious measurements */
535  if (mrtt == 0 || mrtt > TCP_RTT_MAX)
536  goto done;
537 
538  tcp_estimate_rtt (tc, mrtt);
539 
540 done:
541 
542  /* If we got here something must've been ACKed so make sure boff is 0,
543  * even if mrtt is not valid since we update the rto lower */
544  tc->rto_boff = 0;
545  tcp_update_rto (tc);
546 
547  return 0;
548 }
549 
550 static void
551 tcp_estimate_initial_rtt (tcp_connection_t * tc)
552 {
553  u8 thread_index = vlib_num_workers ()? 1 : 0;
554  int mrtt;
555 
556  if (tc->rtt_ts)
557  {
558  tc->mrtt_us = tcp_time_now_us (thread_index) - tc->rtt_ts;
559  tc->mrtt_us = clib_max (tc->mrtt_us, 0.0001);
560  mrtt = clib_max ((u32) (tc->mrtt_us * THZ), 1);
561  tc->rtt_ts = 0;
562  }
563  else
564  {
565  mrtt = tcp_time_now_w_thread (thread_index) - tc->rcv_opts.tsecr;
566  mrtt = clib_max (mrtt, 1);
567  /* Due to retransmits we don't know the initial mrtt */
568  if (tc->rto_boff && mrtt > 1 * THZ)
569  mrtt = 1 * THZ;
570  tc->mrtt_us = (f64) mrtt *TCP_TICK;
571  }
572 
573  if (mrtt > 0 && mrtt < TCP_RTT_MAX)
574  tcp_estimate_rtt (tc, mrtt);
575  tcp_update_rto (tc);
576 }
577 
578 always_inline u8
579 tcp_recovery_no_snd_space (tcp_connection_t * tc)
580 {
581  u32 space;
582 
583  ASSERT (tcp_in_cong_recovery (tc));
584 
585  if (tcp_in_recovery (tc))
586  space = tcp_available_output_snd_space (tc);
587  else
588  space = tcp_fastrecovery_prr_snd_space (tc);
589 
590  return (space < tc->snd_mss + tc->burst_acked);
591 }
592 
593 /**
594  * Dequeue bytes for connections that have received acks in last burst
595  */
596 static void
597 tcp_handle_postponed_dequeues (tcp_worker_ctx_t * wrk)
598 {
599  u32 thread_index = wrk->vm->thread_index;
600  u32 *pending_deq_acked;
601  tcp_connection_t *tc;
602  int i;
603 
604  if (!vec_len (wrk->pending_deq_acked))
605  return;
606 
607  pending_deq_acked = wrk->pending_deq_acked;
608  for (i = 0; i < vec_len (pending_deq_acked); i++)
609  {
610  tc = tcp_connection_get (pending_deq_acked[i], thread_index);
611  tc->flags &= ~TCP_CONN_DEQ_PENDING;
612 
613  if (tc->burst_acked)
614  {
615  /* Dequeue the newly ACKed bytes */
616  session_tx_fifo_dequeue_drop (&tc->connection, tc->burst_acked);
617  tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una);
618 
619  if (PREDICT_FALSE (tc->flags & TCP_CONN_PSH_PENDING))
620  {
621  if (seq_leq (tc->psh_seq, tc->snd_una))
622  tc->flags &= ~TCP_CONN_PSH_PENDING;
623  }
624 
625  /* If everything has been acked, stop retransmit timer
626  * otherwise update. */
627  tcp_retransmit_timer_update (tc);
628 
629  /* Update pacer based on our new cwnd estimate */
630  tcp_connection_tx_pacer_update (tc);
631  }
632 
633  /* Reset the pacer if we've been idle, i.e., no data sent or if
634  * we're in recovery and snd space constrained */
635  if (tc->data_segs_out == tc->prev_dsegs_out
636  || tcp_recovery_no_snd_space (tc))
637  transport_connection_tx_pacer_reset_bucket (&tc->connection, 0);
638 
639  tc->prev_dsegs_out = tc->data_segs_out;
640  tc->burst_acked = 0;
641  }
642  _vec_len (wrk->pending_deq_acked) = 0;
643 }
644 
645 static void
646 tcp_program_dequeue (tcp_worker_ctx_t * wrk, tcp_connection_t * tc)
647 {
648  if (!(tc->flags & TCP_CONN_DEQ_PENDING))
649  {
650  vec_add1 (wrk->pending_deq_acked, tc->c_c_index);
651  tc->flags |= TCP_CONN_DEQ_PENDING;
652  }
653  tc->burst_acked += tc->bytes_acked;
654 }
655 
656 #ifndef CLIB_MARCH_VARIANT
657 static u32
658 scoreboard_hole_index (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole)
659 {
660  ASSERT (!pool_is_free_index (sb->holes, hole - sb->holes));
661  return hole - sb->holes;
662 }
663 
664 static u32
665 scoreboard_hole_bytes (sack_scoreboard_hole_t * hole)
666 {
667  return hole->end - hole->start;
668 }
669 
670 sack_scoreboard_hole_t *
671 scoreboard_get_hole (sack_scoreboard_t * sb, u32 index)
672 {
673  if (index != TCP_INVALID_SACK_HOLE_INDEX)
674  return pool_elt_at_index (sb->holes, index);
675  return 0;
676 }
677 
678 sack_scoreboard_hole_t *
679 scoreboard_next_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole)
680 {
681  if (hole->next != TCP_INVALID_SACK_HOLE_INDEX)
682  return pool_elt_at_index (sb->holes, hole->next);
683  return 0;
684 }
685 
686 sack_scoreboard_hole_t *
687 scoreboard_prev_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole)
688 {
689  if (hole->prev != TCP_INVALID_SACK_HOLE_INDEX)
690  return pool_elt_at_index (sb->holes, hole->prev);
691  return 0;
692 }
693 
694 sack_scoreboard_hole_t *
695 scoreboard_first_hole (sack_scoreboard_t * sb)
696 {
697  if (sb->head != TCP_INVALID_SACK_HOLE_INDEX)
698  return pool_elt_at_index (sb->holes, sb->head);
699  return 0;
700 }
701 
702 sack_scoreboard_hole_t *
703 scoreboard_last_hole (sack_scoreboard_t * sb)
704 {
705  if (sb->tail != TCP_INVALID_SACK_HOLE_INDEX)
706  return pool_elt_at_index (sb->holes, sb->tail);
707  return 0;
708 }
709 
710 static void
711 scoreboard_remove_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole)
712 {
713  sack_scoreboard_hole_t *next, *prev;
714 
715  if (hole->next != TCP_INVALID_SACK_HOLE_INDEX)
716  {
717  next = pool_elt_at_index (sb->holes, hole->next);
718  next->prev = hole->prev;
719  }
720  else
721  {
722  sb->tail = hole->prev;
723  }
724 
725  if (hole->prev != TCP_INVALID_SACK_HOLE_INDEX)
726  {
727  prev = pool_elt_at_index (sb->holes, hole->prev);
728  prev->next = hole->next;
729  }
730  else
731  {
732  sb->head = hole->next;
733  }
734 
735  if (scoreboard_hole_index (sb, hole) == sb->cur_rxt_hole)
736  sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
737 
738  /* Poison the entry */
739  if (CLIB_DEBUG > 0)
740  clib_memset (hole, 0xfe, sizeof (*hole));
741 
742  pool_put (sb->holes, hole);
743 }
744 
745 static sack_scoreboard_hole_t *
746 scoreboard_insert_hole (sack_scoreboard_t * sb, u32 prev_index,
747  u32 start, u32 end)
748 {
749  sack_scoreboard_hole_t *hole, *next, *prev;
750  u32 hole_index;
751 
752  pool_get (sb->holes, hole);
753  clib_memset (hole, 0, sizeof (*hole));
754 
755  hole->start = start;
756  hole->end = end;
757  hole_index = scoreboard_hole_index (sb, hole);
758 
759  prev = scoreboard_get_hole (sb, prev_index);
760  if (prev)
761  {
762  hole->prev = prev_index;
763  hole->next = prev->next;
764 
765  if ((next = scoreboard_next_hole (sb, hole)))
766  next->prev = hole_index;
767  else
768  sb->tail = hole_index;
769 
770  prev->next = hole_index;
771  }
772  else
773  {
774  sb->head = hole_index;
775  hole->prev = TCP_INVALID_SACK_HOLE_INDEX;
776  hole->next = TCP_INVALID_SACK_HOLE_INDEX;
777  }
778 
779  return hole;
780 }
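Holes live in the sb->holes pool and are chained into an ordered doubly-linked list anchored at sb->head and sb->tail. A sketch of walking the un-sacked ranges (illustrative):

sack_scoreboard_hole_t *hole = scoreboard_first_hole (sb);
while (hole)
  {
    /* [hole->start, hole->end) has not been sacked by the peer */
    hole = scoreboard_next_hole (sb, hole);
  }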
781 
782 always_inline void
783 scoreboard_update_sacked_rxt (sack_scoreboard_t * sb, u32 start, u32 end,
784  u8 has_rxt)
785 {
786  if (!has_rxt || seq_geq (start, sb->high_rxt))
787  return;
788 
789  sb->rxt_sacked +=
790  seq_lt (end, sb->high_rxt) ? (end - start) : (sb->high_rxt - start);
791 }
792 
793 always_inline void
794 scoreboard_update_bytes (sack_scoreboard_t * sb, u32 ack, u32 snd_mss)
795 {
796  sack_scoreboard_hole_t *right, *left;
797  u32 sacked = 0, blks = 0, old_sacked;
798 
799  old_sacked = sb->sacked_bytes;
800 
801  sb->last_lost_bytes = 0;
802  sb->lost_bytes = 0;
803  sb->sacked_bytes = 0;
804 
805  right = scoreboard_last_hole (sb);
806  if (!right)
807  {
808  sb->sacked_bytes = sb->high_sacked - ack;
809  return;
810  }
811 
812  if (seq_gt (sb->high_sacked, right->end))
813  {
814  sacked = sb->high_sacked - right->end;
815  blks = 1;
816  }
817 
818  while (sacked < (TCP_DUPACK_THRESHOLD - 1) * snd_mss
819  && blks < TCP_DUPACK_THRESHOLD)
820  {
821  if (right->is_lost)
822  sb->lost_bytes += scoreboard_hole_bytes (right);
823 
824  left = scoreboard_prev_hole (sb, right);
825  if (!left)
826  {
827  ASSERT (right->start == ack || sb->is_reneging);
828  sacked += right->start - ack;
829  right = 0;
830  break;
831  }
832 
833  sacked += right->start - left->end;
834  blks++;
835  right = left;
836  }
837 
838  /* right is first lost */
839  while (right)
840  {
841  sb->lost_bytes += scoreboard_hole_bytes (right);
842  sb->last_lost_bytes += right->is_lost ? 0 : (right->end - right->start);
843  right->is_lost = 1;
844  left = scoreboard_prev_hole (sb, right);
845  if (!left)
846  {
847  ASSERT (right->start == ack || sb->is_reneging);
848  sacked += right->start - ack;
849  break;
850  }
851  sacked += right->start - left->end;
852  right = left;
853  }
854 
855  sb->sacked_bytes = sacked;
856  sb->last_sacked_bytes = sacked - (old_sacked - sb->last_bytes_delivered);
857 }
858 
859 /**
860  * Figure out the next hole to retransmit
861  *
862  * Follows logic proposed in RFC6675 Sec. 4, NextSeg()
863  */
864 static sack_scoreboard_hole_t *
865 scoreboard_next_rxt_hole (sack_scoreboard_t * sb,
866  sack_scoreboard_hole_t * start,
867  u8 have_unsent, u8 * can_rescue, u8 * snd_limited)
868 {
869  sack_scoreboard_hole_t *hole = 0;
870 
871  hole = start ? start : scoreboard_first_hole (sb);
872  while (hole && seq_leq (hole->end, sb->high_rxt) && hole->is_lost)
873  hole = scoreboard_next_hole (sb, hole);
874 
875  /* Nothing, return */
876  if (!hole)
877  {
878  sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
879  return 0;
880  }
881 
882  /* Rule (1): if higher than rxt, less than high_sacked and lost */
883  if (hole->is_lost && seq_lt (hole->start, sb->high_sacked))
884  {
885  sb->cur_rxt_hole = scoreboard_hole_index (sb, hole);
886  }
887  else
888  {
889  /* Rule (2): available unsent data */
890  if (have_unsent)
891  {
892  sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
893  return 0;
894  }
895  /* Rule (3): if hole not lost */
896  else if (seq_lt (hole->start, sb->high_sacked))
897  {
898  /* And we didn't already retransmit it */
899  if (seq_leq (hole->end, sb->high_rxt))
900  {
901  sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
902  return 0;
903  }
904  *snd_limited = 0;
905  sb->cur_rxt_hole = scoreboard_hole_index (sb, hole);
906  }
907  /* Rule (4): if hole beyond high_sacked */
908  else
909  {
910  ASSERT (seq_geq (hole->start, sb->high_sacked));
911  *snd_limited = 1;
912  *can_rescue = 1;
913  /* HighRxt MUST NOT be updated */
914  return 0;
915  }
916  }
917 
918  if (hole && seq_lt (sb->high_rxt, hole->start))
919  sb->high_rxt = hole->start;
920 
921  return hole;
922 }
923 
924 void
925 scoreboard_init_rxt (sack_scoreboard_t * sb, u32 snd_una)
926 {
927  sack_scoreboard_hole_t *hole;
928  hole = scoreboard_first_hole (sb);
929  if (hole)
930  {
931  snd_una = seq_gt (snd_una, hole->start) ? snd_una : hole->start;
932  sb->cur_rxt_hole = sb->head;
933  }
934  sb->high_rxt = snd_una;
935  sb->rescue_rxt = snd_una - 1;
936 }
937 
938 void
939 scoreboard_init (sack_scoreboard_t * sb)
940 {
941  sb->head = TCP_INVALID_SACK_HOLE_INDEX;
942  sb->tail = TCP_INVALID_SACK_HOLE_INDEX;
943  sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
944 }
945 
946 void
947 scoreboard_clear (sack_scoreboard_t * sb)
948 {
949  sack_scoreboard_hole_t *hole;
950  while ((hole = scoreboard_first_hole (sb)))
951  {
952  scoreboard_remove_hole (sb, hole);
953  }
954  ASSERT (sb->head == sb->tail && sb->head == TCP_INVALID_SACK_HOLE_INDEX);
955  ASSERT (pool_elts (sb->holes) == 0);
956  sb->sacked_bytes = 0;
957  sb->last_sacked_bytes = 0;
958  sb->last_bytes_delivered = 0;
959  sb->lost_bytes = 0;
960  sb->last_lost_bytes = 0;
961  sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
962  sb->is_reneging = 0;
963 }
964 
965 void
966 scoreboard_clear_reneging (sack_scoreboard_t * sb, u32 start, u32 end)
967 {
968  sack_scoreboard_hole_t *last_hole;
969 
970  clib_warning ("sack reneging");
971 
972  scoreboard_clear (sb);
973  last_hole = scoreboard_insert_hole (sb, TCP_INVALID_SACK_HOLE_INDEX,
974  start, end);
975  last_hole->is_lost = 1;
976  sb->tail = scoreboard_hole_index (sb, last_hole);
977  sb->high_sacked = start;
978  scoreboard_init_rxt (sb, start);
979 }
980 
981 #endif /* CLIB_MARCH_VARIANT */
982 
983 /**
984  * Test that scoreboard is sane after recovery
985  *
986  * Returns 1 if the scoreboard is empty or if the first hole starts at or
987  * beyond snd_una and ends before snd_nxt.
988  */
989 static u8
990 tcp_scoreboard_is_sane_post_recovery (tcp_connection_t * tc)
991 {
992  sack_scoreboard_hole_t *hole;
993  hole = scoreboard_first_hole (&tc->sack_sb);
994  return (!hole || (seq_geq (hole->start, tc->snd_una)
995  && seq_lt (hole->end, tc->snd_nxt)));
996 }
997 
998 #ifndef CLIB_MARCH_VARIANT
999 
1000 void
1001 tcp_rcv_sacks (tcp_connection_t * tc, u32 ack)
1002 {
1003  sack_scoreboard_hole_t *hole, *next_hole;
1004  sack_scoreboard_t *sb = &tc->sack_sb;
1005  sack_block_t *blk, *rcv_sacks;
1006  u32 blk_index = 0, i, j;
1007  u8 has_rxt;
1008 
1009  sb->last_sacked_bytes = 0;
1010  sb->last_bytes_delivered = 0;
1011  sb->rxt_sacked = 0;
1012 
1013  if (!tcp_opts_sack (&tc->rcv_opts)
1014  && sb->head == TCP_INVALID_SACK_HOLE_INDEX)
1015  return;
1016 
1017  has_rxt = tcp_in_cong_recovery (tc);
1018 
1019  /* Remove invalid blocks */
1020  blk = tc->rcv_opts.sacks;
1021  while (blk < vec_end (tc->rcv_opts.sacks))
1022  {
1023  if (seq_lt (blk->start, blk->end)
1024  && seq_gt (blk->start, tc->snd_una)
1025  && seq_gt (blk->start, ack)
1026  && seq_lt (blk->start, tc->snd_nxt)
1027  && seq_leq (blk->end, tc->snd_nxt))
1028  {
1029  blk++;
1030  continue;
1031  }
1032  vec_del1 (tc->rcv_opts.sacks, blk - tc->rcv_opts.sacks);
1033  }
1034 
1035  /* Add block for cumulative ack */
1036  if (seq_gt (ack, tc->snd_una))
1037  {
1038  vec_add2 (tc->rcv_opts.sacks, blk, 1);
1039  blk->start = tc->snd_una;
1040  blk->end = ack;
1041  }
1042 
1043  if (vec_len (tc->rcv_opts.sacks) == 0)
1044  return;
1045 
1046  tcp_scoreboard_trace_add (tc, ack);
1047 
1048  /* Make sure blocks are ordered */
1049  rcv_sacks = tc->rcv_opts.sacks;
1050  for (i = 0; i < vec_len (rcv_sacks); i++)
1051  for (j = i + 1; j < vec_len (rcv_sacks); j++)
1052  if (seq_lt (rcv_sacks[j].start, rcv_sacks[i].start))
1053  {
1054  sack_block_t tmp = rcv_sacks[i];
1055  rcv_sacks[i] = rcv_sacks[j];
1056  rcv_sacks[j] = tmp;
1057  }
1058 
1059  if (sb->head == TCP_INVALID_SACK_HOLE_INDEX)
1060  {
1061  /* Handle reneging as a special case */
1062  if (PREDICT_FALSE (sb->is_reneging))
1063  {
1064  /* No holes, only sacked bytes */
1065  if (seq_leq (tc->snd_nxt, sb->high_sacked))
1066  {
1067  /* No progress made so return */
1068  if (seq_leq (ack, tc->snd_una))
1069  return;
1070 
1071  /* Update sacked bytes delivered and return */
1072  sb->last_bytes_delivered = ack - tc->snd_una;
1073  sb->sacked_bytes -= sb->last_bytes_delivered;
1074  sb->is_reneging = seq_lt (ack, sb->high_sacked);
1075  return;
1076  }
1077 
1078  /* New hole above high sacked. Add it and process normally */
1079  hole = scoreboard_insert_hole (sb, TCP_INVALID_SACK_HOLE_INDEX,
1080  sb->high_sacked, tc->snd_nxt);
1081  sb->tail = scoreboard_hole_index (sb, hole);
1082  }
1083  /* Not reneging and no holes. Insert the first that covers all
1084  * outstanding bytes */
1085  else
1086  {
1087  hole = scoreboard_insert_hole (sb, TCP_INVALID_SACK_HOLE_INDEX,
1088  tc->snd_una, tc->snd_nxt);
1089  sb->tail = scoreboard_hole_index (sb, hole);
1090  }
1091  sb->high_sacked = rcv_sacks[vec_len (rcv_sacks) - 1].end;
1092  }
1093  else
1094  {
1095  /* If we have holes but snd_nxt is beyond the last hole, update
1096  * last hole end or add new hole after high sacked */
1097  hole = scoreboard_last_hole (sb);
1098  if (seq_gt (tc->snd_nxt, hole->end))
1099  {
1100  if (seq_geq (hole->start, sb->high_sacked))
1101  {
1102  hole->end = tc->snd_nxt;
1103  }
1104  /* New hole after high sacked block */
1105  else if (seq_lt (sb->high_sacked, tc->snd_nxt))
1106  {
1107  scoreboard_insert_hole (sb, sb->tail, sb->high_sacked,
1108  tc->snd_nxt);
1109  }
1110  }
1111 
1112  /* Keep track of max byte sacked for when the last hole
1113  * is acked */
1114  sb->high_sacked = seq_max (rcv_sacks[vec_len (rcv_sacks) - 1].end,
1115  sb->high_sacked);
1116  }
1117 
1118  /* Walk the holes with the SACK blocks */
1119  hole = pool_elt_at_index (sb->holes, sb->head);
1120 
1121  if (PREDICT_FALSE (sb->is_reneging))
1122  sb->last_bytes_delivered += hole->start - tc->snd_una;
1123 
1124  while (hole && blk_index < vec_len (rcv_sacks))
1125  {
1126  blk = &rcv_sacks[blk_index];
1127  if (seq_leq (blk->start, hole->start))
1128  {
1129  /* Block covers hole. Remove hole */
1130  if (seq_geq (blk->end, hole->end))
1131  {
1132  next_hole = scoreboard_next_hole (sb, hole);
1133 
1134  /* If covered by ack, compute delivered bytes */
1135  if (blk->end == ack)
1136  {
1137  u32 sacked = next_hole ? next_hole->start : sb->high_sacked;
1138  if (PREDICT_FALSE (seq_lt (ack, sacked)))
1139  {
1140  sb->last_bytes_delivered += ack - hole->end;
1141  sb->is_reneging = 1;
1142  }
1143  else
1144  {
1145  sb->last_bytes_delivered += sacked - hole->end;
1146  sb->is_reneging = 0;
1147  }
1148  }
1149  scoreboard_update_sacked_rxt (sb, hole->start, hole->end,
1150  has_rxt);
1151  scoreboard_remove_hole (sb, hole);
1152  hole = next_hole;
1153  }
1154  /* Partial 'head' overlap */
1155  else
1156  {
1157  if (seq_gt (blk->end, hole->start))
1158  {
1159  scoreboard_update_sacked_rxt (sb, hole->start, blk->end,
1160  has_rxt);
1161  hole->start = blk->end;
1162  }
1163  blk_index++;
1164  }
1165  }
1166  else
1167  {
1168  /* Hole must be split */
1169  if (seq_lt (blk->end, hole->end))
1170  {
1171  u32 hole_index = scoreboard_hole_index (sb, hole);
1172  next_hole = scoreboard_insert_hole (sb, hole_index, blk->end,
1173  hole->end);
1174  /* Pool might've moved */
1175  hole = scoreboard_get_hole (sb, hole_index);
1176  hole->end = blk->start;
1177 
1178  scoreboard_update_sacked_rxt (sb, blk->start, blk->end,
1179  has_rxt);
1180 
1181  blk_index++;
1182  ASSERT (hole->next == scoreboard_hole_index (sb, next_hole));
1183  }
1184  else if (seq_lt (blk->start, hole->end))
1185  {
1186  scoreboard_update_sacked_rxt (sb, blk->start, hole->end,
1187  has_rxt);
1188  hole->end = blk->start;
1189  }
1190  hole = scoreboard_next_hole (sb, hole);
1191  }
1192  }
1193 
1194  scoreboard_update_bytes (sb, ack, tc->snd_mss);
1195 
1196  ASSERT (sb->last_sacked_bytes <= sb->sacked_bytes || tcp_in_recovery (tc));
1197  ASSERT (sb->sacked_bytes == 0 || tcp_in_recovery (tc)
1198  || sb->sacked_bytes <= tc->snd_nxt - seq_max (tc->snd_una, ack));
1199  ASSERT (sb->last_sacked_bytes + sb->lost_bytes <= tc->snd_nxt
1200  - seq_max (tc->snd_una, ack) || tcp_in_recovery (tc));
1201  ASSERT (sb->head == TCP_INVALID_SACK_HOLE_INDEX || tcp_in_recovery (tc)
1202  || sb->is_reneging || sb->holes[sb->head].start == ack);
1203  ASSERT (sb->last_lost_bytes <= sb->lost_bytes);
1204  ASSERT ((ack - tc->snd_una) + sb->last_sacked_bytes
1205  - sb->last_bytes_delivered >= sb->rxt_sacked);
1206  ASSERT ((ack - tc->snd_una) >= tc->sack_sb.last_bytes_delivered
1207  || (tc->flags & TCP_CONN_FINSNT));
1208 
1209  TCP_EVT (TCP_EVT_CC_SCOREBOARD, tc);
1210 }
1211 #endif /* CLIB_MARCH_VARIANT */
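Worked example of the walk above (illustrative numbers):

/* snd_una = 1000, snd_nxt = 5000, empty scoreboard. A SACK block
 * [2000, 3000) first creates the hole [1000, 5000), then splits it into
 * [1000, 2000) and [3000, 5000); sacked_bytes becomes 1000 and
 * high_sacked advances to 3000. */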
1212 
1213 /**
1214  * Try to update snd_wnd based on feedback received from peer.
1215  *
1216  * If successful, and new window is 'effectively' 0, activate persist
1217  * timer.
1218  */
1219 static void
1220 tcp_update_snd_wnd (tcp_connection_t * tc, u32 seq, u32 ack, u32 snd_wnd)
1221 {
1222  /* If (SND.WL1 < SEG.SEQ or (SND.WL1 = SEG.SEQ and SND.WL2 =< SEG.ACK)), set
1223  * SND.WND <- SEG.WND, set SND.WL1 <- SEG.SEQ, and set SND.WL2 <- SEG.ACK */
1224  if (seq_lt (tc->snd_wl1, seq)
1225  || (tc->snd_wl1 == seq && seq_leq (tc->snd_wl2, ack)))
1226  {
1227  tc->snd_wnd = snd_wnd;
1228  tc->snd_wl1 = seq;
1229  tc->snd_wl2 = ack;
1230  TCP_EVT (TCP_EVT_SND_WND, tc);
1231 
1232  if (PREDICT_FALSE (tc->snd_wnd < tc->snd_mss))
1233  {
1234  /* Set persist timer if not set and we just got 0 wnd */
1235  if (!tcp_timer_is_active (tc, TCP_TIMER_PERSIST)
1236  && !tcp_timer_is_active (tc, TCP_TIMER_RETRANSMIT))
1237  tcp_persist_timer_set (tc);
1238  }
1239  else
1240  else
1241  {
1242  if (PREDICT_FALSE (!tcp_in_recovery (tc) && tc->rto_boff > 0))
1243  {
1244  tc->rto_boff = 0;
1245  tcp_update_rto (tc);
1246  }
1247  }
1248  }
1249 }
1250 
1251 /**
1252  * Init loss recovery/fast recovery.
1253  *
1254  * Triggered by dup acks as opposed to timer timeout. Note that cwnd is
1255  * updated in @ref tcp_cc_handle_event after fast retransmit
1256  */
1257 static void
1258 tcp_cc_init_congestion (tcp_connection_t * tc)
1259 {
1260  tcp_fastrecovery_on (tc);
1261  tc->snd_congestion = tc->snd_nxt;
1262  tc->cwnd_acc_bytes = 0;
1263  tc->snd_rxt_bytes = 0;
1264  tc->rxt_delivered = 0;
1265  tc->prr_delivered = 0;
1266  tc->prr_start = tc->snd_una;
1267  tc->prev_ssthresh = tc->ssthresh;
1268  tc->prev_cwnd = tc->cwnd;
1269 
1270  tc->snd_rxt_ts = tcp_tstamp (tc);
1271  tcp_cc_congestion (tc);
1272 
1273  /* Post retransmit update cwnd to ssthresh and account for the
1274  * three segments that have left the network and should've been
1275  * buffered at the receiver XXX */
1276  if (!tcp_opts_sack_permitted (&tc->rcv_opts))
1277  tc->cwnd += 3 * tc->snd_mss;
1278 
1279  tc->fr_occurences += 1;
1280  TCP_EVT (TCP_EVT_CC_EVT, tc, 4);
1281 }
1282 
1283 static void
1284 tcp_cc_congestion_undo (tcp_connection_t * tc)
1285 {
1286  tc->cwnd = tc->prev_cwnd;
1287  tc->ssthresh = tc->prev_ssthresh;
1288  tcp_cc_undo_recovery (tc);
1289  ASSERT (tc->rto_boff == 0);
1290  TCP_EVT (TCP_EVT_CC_EVT, tc, 5);
1291 }
1292 
1293 static inline u8
1294 tcp_cc_is_spurious_timeout_rxt (tcp_connection_t * tc)
1295 {
1296  return (tcp_in_recovery (tc) && tc->rto_boff == 1
1297  && tc->snd_rxt_ts
1298  && tcp_opts_tstamp (&tc->rcv_opts)
1299  && timestamp_lt (tc->rcv_opts.tsecr, tc->snd_rxt_ts));
1300 }
1301 
1302 static inline u8
1303 tcp_cc_is_spurious_retransmit (tcp_connection_t * tc)
1304 {
1305  return (tcp_cc_is_spurious_timeout_rxt (tc));
1306 }
1307 
1308 static inline u8
1309 tcp_should_fastrecover_sack (tcp_connection_t * tc)
1310 {
1311  return (tc->sack_sb.lost_bytes
1312  || ((TCP_DUPACK_THRESHOLD - 1) * tc->snd_mss
1313  < tc->sack_sb.sacked_bytes));
1314 }
1315 
1316 static inline u8
1317 tcp_should_fastrecover (tcp_connection_t * tc, u8 has_sack)
1318 {
1319  if (!has_sack)
1320  {
1321  /* If either of the two conditions below holds, reset dupacks because
1322  * we're probably after timeout (RFC6582 heuristics).
1323  * If Cumulative ack does not cover more than congestion threshold,
1324  * and:
1325  * 1) The following doesn't hold: The congestion window is greater
1326  * than SMSS bytes and the difference between highest_ack
1327  * and prev_highest_ack is at most 4*SMSS bytes
1328  * 2) Echoed timestamp in the last non-dup ack does not equal the
1329  * stored timestamp
1330  */
1331  if (seq_leq (tc->snd_una, tc->snd_congestion)
1332  && ((!(tc->cwnd > tc->snd_mss
1333  && tc->bytes_acked <= 4 * tc->snd_mss))
1334  || (tc->rcv_opts.tsecr != tc->tsecr_last_ack)))
1335  {
1336  tc->rcv_dupacks = 0;
1337  return 0;
1338  }
1339  }
1340  return ((tc->rcv_dupacks == TCP_DUPACK_THRESHOLD)
1341  || tcp_should_fastrecover_sack (tc));
1342 }
1343 
1344 static int
1345 tcp_cc_recover (tcp_connection_t * tc)
1346 {
1347  sack_scoreboard_hole_t *hole;
1348  u8 is_spurious = 0;
1349 
1350  ASSERT (tcp_in_cong_recovery (tc));
1351 
1352  if (tcp_cc_is_spurious_retransmit (tc))
1353  {
1354  tcp_cc_congestion_undo (tc);
1355  is_spurious = 1;
1356  }
1357 
1358  tcp_connection_tx_pacer_reset (tc, tc->cwnd, 0 /* start bucket */ );
1359  tc->rcv_dupacks = 0;
1360 
1361  /* Previous recovery left us congested. Continue sending as part
1362  * of the current recovery event with an updated snd_congestion */
1363  if (tc->sack_sb.sacked_bytes)
1364  {
1365  tc->snd_congestion = tc->snd_nxt;
1366  tcp_program_retransmit (tc);
1367  return is_spurious;
1368  }
1369 
1370  tc->rxt_delivered = 0;
1371  tc->snd_rxt_bytes = 0;
1372  tc->snd_rxt_ts = 0;
1373  tc->prr_delivered = 0;
1374  tc->rtt_ts = 0;
1375  tc->flags &= ~TCP_CONN_RXT_PENDING;
1376 
1377  hole = scoreboard_first_hole (&tc->sack_sb);
1378  if (hole && hole->start == tc->snd_una && hole->end == tc->snd_nxt)
1379  scoreboard_clear (&tc->sack_sb);
1380 
1381  if (!tcp_in_recovery (tc) && !is_spurious)
1382  tcp_cc_recovered (tc);
1383 
1384  tcp_fastrecovery_off (tc);
1385  tcp_fastrecovery_first_off (tc);
1386  tcp_recovery_off (tc);
1387  TCP_EVT (TCP_EVT_CC_EVT, tc, 3);
1388 
1389  ASSERT (tc->rto_boff == 0);
1390  ASSERT (!tcp_in_cong_recovery (tc));
1391  ASSERT (tcp_scoreboard_is_sane_post_recovery (tc));
1392  return is_spurious;
1393 }
1394 
1395 static void
1396 tcp_cc_update (tcp_connection_t * tc, tcp_rate_sample_t * rs)
1397 {
1398  ASSERT (!tcp_in_cong_recovery (tc) || tcp_is_lost_fin (tc));
1399 
1400  /* Congestion avoidance */
1401  tcp_cc_rcv_ack (tc, rs);
1402 
1403  /* If a cumulative ack, make sure dupacks is 0 */
1404  tc->rcv_dupacks = 0;
1405 
1406  /* When dupacks hits the threshold we only enter fast retransmit if
1407  * cumulative ack covers more than snd_congestion. Should snd_una
1408  * wrap this test may fail under otherwise valid circumstances.
1409  * Therefore, proactively update snd_congestion when wrap detected. */
1410  if (PREDICT_FALSE
1411  (seq_leq (tc->snd_congestion, tc->snd_una - tc->bytes_acked)
1412  && seq_gt (tc->snd_congestion, tc->snd_una)))
1413  tc->snd_congestion = tc->snd_una - 1;
1414 }
1415 
1416 /**
1417  * One function to rule them all ... and in the darkness bind them
1418  */
1419 static void
1420 tcp_cc_handle_event (tcp_connection_t * tc, tcp_rate_sample_t * rs,
1421  u32 is_dack)
1422 {
1423  u8 has_sack = tcp_opts_sack_permitted (&tc->rcv_opts);
1424 
1425  /*
1426  * If not in recovery, figure out if we should enter
1427  */
1428  if (!tcp_in_cong_recovery (tc))
1429  {
1430  ASSERT (is_dack);
1431 
1432  tc->rcv_dupacks++;
1433  TCP_EVT (TCP_EVT_DUPACK_RCVD, tc, 1);
1434  tcp_cc_rcv_cong_ack (tc, TCP_CC_DUPACK, rs);
1435 
1436  if (tcp_should_fastrecover (tc, has_sack))
1437  {
1438  tcp_cc_init_congestion (tc);
1439 
1440  if (has_sack)
1441  scoreboard_init_rxt (&tc->sack_sb, tc->snd_una);
1442 
1443  tcp_connection_tx_pacer_reset (tc, tc->cwnd, 0 /* start bucket */ );
1444  tcp_program_retransmit (tc);
1445  }
1446 
1447  return;
1448  }
1449 
1450  /*
1451  * Already in recovery
1452  */
1453 
1454  /*
1455  * Process (re)transmit feedback. Output path uses this to decide how much
1456  * more data to release into the network
1457  */
1458  if (has_sack)
1459  {
1460  if (!tc->bytes_acked && tc->sack_sb.rxt_sacked)
1461  tcp_fastrecovery_first_on (tc);
1462 
1463  tc->rxt_delivered += tc->sack_sb.rxt_sacked;
1464  tc->prr_delivered += tc->bytes_acked + tc->sack_sb.last_sacked_bytes
1465  - tc->sack_sb.last_bytes_delivered;
1466 
1467  tcp_program_retransmit (tc);
1468  }
1469  else
1470  {
1471  if (is_dack)
1472  {
1473  tc->rcv_dupacks += 1;
1474  TCP_EVT (TCP_EVT_DUPACK_RCVD, tc, 1);
1475  }
1476  tc->rxt_delivered = clib_max (tc->rxt_delivered + tc->bytes_acked,
1477  tc->snd_rxt_bytes);
1478  if (is_dack)
1479  tc->prr_delivered += clib_min (tc->snd_mss,
1480  tc->snd_nxt - tc->snd_una);
1481  else
1482  tc->prr_delivered += tc->bytes_acked - clib_min (tc->bytes_acked,
1483  tc->snd_mss *
1484  tc->rcv_dupacks);
1485 
1486  /* If partial ack, assume that the first un-acked segment was lost */
1487  if (tc->bytes_acked || tc->rcv_dupacks == TCP_DUPACK_THRESHOLD)
1488  tcp_fastrecovery_first_on (tc);
1489 
1490  tcp_program_retransmit (tc);
1491  }
1492 
1493  /*
1494  * See if we can exit and stop retransmitting
1495  */
1496  if (seq_geq (tc->snd_una, tc->snd_congestion))
1497  {
1498  /* If spurious return, we've already updated everything */
1499  if (tcp_cc_recover (tc))
1500  {
1501  tc->tsecr_last_ack = tc->rcv_opts.tsecr;
1502  return;
1503  }
1504 
1505  /* Treat as congestion avoidance ack */
1506  tcp_cc_rcv_ack (tc, rs);
1507  return;
1508  }
1509 
1510  /*
1511  * Notify cc of the event
1512  */
1513 
1514  if (!tc->bytes_acked)
1515  {
1516  tcp_cc_rcv_cong_ack (tc, TCP_CC_PARTIALACK, rs);
1517  return;
1518  }
1519 
1520  /* RFC6675: If the incoming ACK is a cumulative acknowledgment,
1521  * reset dupacks to 0. Also needed if in congestion recovery */
1522  tc->rcv_dupacks = 0;
1523 
1524  if (tcp_in_recovery (tc))
1525  tcp_cc_rcv_ack (tc, rs);
1526  else
1527  tcp_cc_rcv_cong_ack (tc, TCP_CC_PARTIALACK, rs);
1528 }
1529 
1530 /**
1531  * Check if duplicate ack as per RFC5681 Sec. 2
1532  */
1533 static u8
1534 tcp_ack_is_dupack (tcp_connection_t * tc, vlib_buffer_t * b, u32 prev_snd_wnd,
1535  u32 prev_snd_una)
1536 {
1537  return ((vnet_buffer (b)->tcp.ack_number == prev_snd_una)
1538  && seq_gt (tc->snd_nxt, tc->snd_una)
1539  && (vnet_buffer (b)->tcp.seq_end == vnet_buffer (b)->tcp.seq_number)
1540  && (prev_snd_wnd == tc->snd_wnd));
1541 }
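Worked example of the four conditions above (illustrative numbers):

/* snd_una = 1000, snd_nxt = 3000, advertised window unchanged. A
 * payload-less ACK (seq_end == seq_number) that re-acks 1000 is a dupack;
 * an ACK that carries data, acks new bytes or changes snd_wnd is not. */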
1542 
1543 /**
1544  * Checks if ack is a congestion control event.
1545  */
1546 static u8
1547 tcp_ack_is_cc_event (tcp_connection_t * tc, vlib_buffer_t * b,
1548  u32 prev_snd_wnd, u32 prev_snd_una, u8 * is_dack)
1549 {
1550  /* Check if ack is duplicate. Per RFC 6675, ACKs that SACK new data are
1551  * defined to be 'duplicate' as well */
1552  *is_dack = tc->sack_sb.last_sacked_bytes
1553  || tcp_ack_is_dupack (tc, b, prev_snd_wnd, prev_snd_una);
1554 
1555  /* If reneging, wait for timer based retransmits */
1556  if (PREDICT_FALSE (tcp_is_lost_fin (tc) || tc->sack_sb.is_reneging))
1557  return 0;
1558 
1559  return (*is_dack || tcp_in_cong_recovery (tc));
1560 }
1561 
1562 /**
1563  * Process incoming ACK
1564  */
1565 static int
1566 tcp_rcv_ack (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, vlib_buffer_t * b,
1567  tcp_header_t * th, u32 * error)
1568 {
1569  u32 prev_snd_wnd, prev_snd_una;
1570  tcp_rate_sample_t rs = { 0 };
1571  u8 is_dack;
1572 
1573  TCP_EVT (TCP_EVT_CC_STAT, tc);
1574 
1575  /* If the ACK acks something not yet sent (SEG.ACK > SND.NXT) */
1576  if (PREDICT_FALSE (seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_nxt)))
1577  {
1578  /* We've probably entered recovery and the peer still has some
1579  * of the data we've sent. Update snd_nxt and accept the ack */
1580  if (seq_leq (vnet_buffer (b)->tcp.ack_number, tc->snd_una_max)
1581  && seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_una))
1582  {
1583  tc->snd_nxt = vnet_buffer (b)->tcp.ack_number;
1584  goto process_ack;
1585  }
1586 
1587  tc->errors.above_ack_wnd += 1;
1588  *error = TCP_ERROR_ACK_FUTURE;
1589  TCP_EVT (TCP_EVT_ACK_RCV_ERR, tc, 0, vnet_buffer (b)->tcp.ack_number);
1590  return -1;
1591  }
1592 
1593  /* If old ACK, probably it's an old dupack */
1594  if (PREDICT_FALSE (seq_lt (vnet_buffer (b)->tcp.ack_number, tc->snd_una)))
1595  {
1596  tc->errors.below_ack_wnd += 1;
1597  *error = TCP_ERROR_ACK_OLD;
1598  TCP_EVT (TCP_EVT_ACK_RCV_ERR, tc, 1, vnet_buffer (b)->tcp.ack_number);
1599  if (tcp_in_fastrecovery (tc) && tc->rcv_dupacks == TCP_DUPACK_THRESHOLD)
1600  tcp_cc_handle_event (tc, 0, 1);
1601  /* Don't drop yet */
1602  return 0;
1603  }
1604 
1605 process_ack:
1606 
1607  /*
1608  * Looks okay, process feedback
1609  */
1610 
1611  if (tcp_opts_sack_permitted (&tc->rcv_opts))
1612  tcp_rcv_sacks (tc, vnet_buffer (b)->tcp.ack_number);
1613 
1614  prev_snd_wnd = tc->snd_wnd;
1615  prev_snd_una = tc->snd_una;
1616  tcp_update_snd_wnd (tc, vnet_buffer (b)->tcp.seq_number,
1617  vnet_buffer (b)->tcp.ack_number,
1618  clib_net_to_host_u16 (th->window) << tc->snd_wscale);
1619  tc->bytes_acked = vnet_buffer (b)->tcp.ack_number - tc->snd_una;
1620  tc->snd_una = vnet_buffer (b)->tcp.ack_number;
1621  tcp_validate_txf_size (tc, tc->bytes_acked);
1622 
1623  if (tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE)
1624  tcp_bt_sample_delivery_rate (tc, &rs);
1625 
1626  tcp_program_dequeue (wrk, tc);
1627 
1628  if (tc->bytes_acked)
1629  tcp_update_rtt (tc, &rs, vnet_buffer (b)->tcp.ack_number);
1630 
1631  TCP_EVT (TCP_EVT_ACK_RCVD, tc);
1632 
1633  /*
1634  * Check if we have congestion event
1635  */
1636 
1637  if (tcp_ack_is_cc_event (tc, b, prev_snd_wnd, prev_snd_una, &is_dack))
1638  {
1639  tcp_cc_handle_event (tc, &rs, is_dack);
1640  tc->dupacks_in += is_dack;
1641  if (!tcp_in_cong_recovery (tc))
1642  {
1643  *error = TCP_ERROR_ACK_OK;
1644  return 0;
1645  }
1646  *error = TCP_ERROR_ACK_DUP;
1647  if (vnet_buffer (b)->tcp.data_len || tcp_is_fin (th))
1648  return 0;
1649  return -1;
1650  }
1651 
1652  /*
1653  * Update congestion control (slow start/congestion avoidance)
1654  */
1655  tcp_cc_update (tc, &rs);
1656  *error = TCP_ERROR_ACK_OK;
1657  return 0;
1658 }
1659 
1660 static void
1661 tcp_program_disconnect (tcp_worker_ctx_t * wrk, tcp_connection_t * tc)
1662 {
1663  if (!tcp_disconnect_pending (tc))
1664  {
1665  vec_add1 (wrk->pending_disconnects, tc->c_c_index);
1666  tcp_disconnect_pending_on (tc);
1667  }
1668 }
1669 
1670 static void
1671 tcp_handle_disconnects (tcp_worker_ctx_t * wrk)
1672 {
1673  u32 thread_index, *pending_disconnects;
1674  tcp_connection_t *tc;
1675  int i;
1676 
1677  if (!vec_len (wrk->pending_disconnects))
1678  return;
1679 
1680  thread_index = wrk->vm->thread_index;
1681  pending_disconnects = wrk->pending_disconnects;
1682  for (i = 0; i < vec_len (pending_disconnects); i++)
1683  {
1684  tc = tcp_connection_get (pending_disconnects[i], thread_index);
1685  tcp_disconnect_pending_off (tc);
1686  session_transport_closing_notify (&tc->connection);
1687  }
1688  _vec_len (wrk->pending_disconnects) = 0;
1689 }
1690 
1691 static void
1692 tcp_rcv_fin (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, vlib_buffer_t * b,
1693  u32 * error)
1694 {
1695  /* Reject out-of-order fins */
1696  if (vnet_buffer (b)->tcp.seq_end != tc->rcv_nxt)
1697  return;
1698 
1699  /* Account for the FIN and send ack */
1700  tc->rcv_nxt += 1;
1701  tc->flags |= TCP_CONN_FINRCVD;
1702  tcp_program_ack (tc);
1703  /* Enter CLOSE-WAIT and notify session. To avoid lingering
1704  * in CLOSE-WAIT, set timer (reuse WAITCLOSE). */
1705  tcp_connection_set_state (tc, TCP_STATE_CLOSE_WAIT);
1706  tcp_program_disconnect (wrk, tc);
1707  tcp_timer_update (tc, TCP_TIMER_WAITCLOSE, tcp_cfg.closewait_time);
1708  TCP_EVT (TCP_EVT_FIN_RCVD, tc);
1709  *error = TCP_ERROR_FIN_RCVD;
1710 }
1711 
1712 #ifndef CLIB_MARCH_VARIANT
1713 static u8
1714 tcp_sack_vector_is_sane (sack_block_t * sacks)
1715 {
1716  int i;
1717  for (i = 1; i < vec_len (sacks); i++)
1718  {
1719  if (sacks[i - 1].end == sacks[i].start)
1720  return 0;
1721  }
1722  return 1;
1723 }
1724 
1725 /**
1726  * Build SACK list as per RFC2018.
1727  *
1728  * Makes sure the first block contains the segment that generated the current
1729  * ACK and the following ones are the ones most recently reported in SACK
1730  * blocks.
1731  *
1732  * @param tc TCP connection for which the SACK list is updated
1733  * @param start Start sequence number of the newest SACK block
1734  * @param end End sequence of the newest SACK block
1735  */
1736 void
1737 tcp_update_sack_list (tcp_connection_t * tc, u32 start, u32 end)
1738 {
1739  sack_block_t *new_list = tc->snd_sacks_fl, *block = 0;
1740  int i;
1741 
1742  /* If the first segment is ooo add it to the list. Last write might've moved
1743  * rcv_nxt over the first segment. */
1744  if (seq_lt (tc->rcv_nxt, start))
1745  {
1746  vec_add2 (new_list, block, 1);
1747  block->start = start;
1748  block->end = end;
1749  }
1750 
1751  /* Find the blocks still worth keeping. */
1752  for (i = 0; i < vec_len (tc->snd_sacks); i++)
1753  {
1754  /* Discard if rcv_nxt advanced beyond current block */
1755  if (seq_leq (tc->snd_sacks[i].start, tc->rcv_nxt))
1756  continue;
1757 
1758  /* Merge or drop if segment overlapped by the new segment */
1759  if (block && (seq_geq (tc->snd_sacks[i].end, new_list[0].start)
1760  && seq_leq (tc->snd_sacks[i].start, new_list[0].end)))
1761  {
1762  if (seq_lt (tc->snd_sacks[i].start, new_list[0].start))
1763  new_list[0].start = tc->snd_sacks[i].start;
1764  if (seq_lt (new_list[0].end, tc->snd_sacks[i].end))
1765  new_list[0].end = tc->snd_sacks[i].end;
1766  continue;
1767  }
1768 
1769  /* Save to new SACK list if we have space. */
1770  if (vec_len (new_list) < TCP_MAX_SACK_BLOCKS)
1771  vec_add1 (new_list, tc->snd_sacks[i]);
1772  }
1773 
1774  ASSERT (vec_len (new_list) <= TCP_MAX_SACK_BLOCKS);
1775 
1776  /* Replace old vector with new one */
1777  vec_reset_length (tc->snd_sacks);
1778  tc->snd_sacks_fl = tc->snd_sacks;
1779  tc->snd_sacks = new_list;
1780 
1781  /* Segments should not 'touch' */
1782  ASSERT (tcp_sack_vector_is_sane (tc->snd_sacks));
1783 }
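Worked example of the list building above (illustrative numbers):

/* rcv_nxt = 100. An ooo segment [300, 400) arrives:
 *   snd_sacks = { [300, 400) }
 * A second ooo segment [150, 200) arrives and, per RFC2018, the newest
 * block is reported first:
 *   snd_sacks = { [150, 200), [300, 400) }
 * Blocks whose start falls at or below rcv_nxt are dropped on the next
 * update. */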
1784 
1785 u32
1786 tcp_sack_list_bytes (tcp_connection_t * tc)
1787 {
1788  u32 bytes = 0, i;
1789  for (i = 0; i < vec_len (tc->snd_sacks); i++)
1790  bytes += tc->snd_sacks[i].end - tc->snd_sacks[i].start;
1791  return bytes;
1792 }
1793 #endif /* CLIB_MARCH_VARIANT */
1794 
1795 /** Enqueue data for delivery to application */
1796 static int
1797 tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b,
1798  u16 data_len)
1799 {
1800  int written, error = TCP_ERROR_ENQUEUED;
1801 
1802  ASSERT (seq_geq (vnet_buffer (b)->tcp.seq_number, tc->rcv_nxt));
1803  ASSERT (data_len);
1804  written = session_enqueue_stream_connection (&tc->connection, b, 0,
1805  1 /* queue event */ , 1);
1806  tc->bytes_in += written;
1807 
1808  TCP_EVT (TCP_EVT_INPUT, tc, 0, data_len, written);
1809 
1810  /* Update rcv_nxt */
1811  if (PREDICT_TRUE (written == data_len))
1812  {
1813  tc->rcv_nxt += written;
1814  }
1815  /* If more data written than expected, account for out-of-order bytes. */
1816  else if (written > data_len)
1817  {
1818  tc->rcv_nxt += written;
1819  TCP_EVT (TCP_EVT_CC_INPUT, tc, data_len, written);
1820  }
1821  else if (written > 0)
1822  {
1823  /* We've written something but FIFO is probably full now */
1824  tc->rcv_nxt += written;
1825  error = TCP_ERROR_PARTIALLY_ENQUEUED;
1826  }
1827  else
1828  {
1829  return TCP_ERROR_FIFO_FULL;
1830  }
1831 
1832  /* Update SACK list if need be */
1833  if (tcp_opts_sack_permitted (&tc->rcv_opts))
1834  {
1835  /* Remove SACK blocks that have been delivered */
1836  tcp_update_sack_list (tc, tc->rcv_nxt, tc->rcv_nxt);
1837  }
1838 
1839  return error;
1840 }
1841 
1842 /** Enqueue out-of-order data */
1843 static int
1844 tcp_session_enqueue_ooo (tcp_connection_t * tc, vlib_buffer_t * b,
1845  u16 data_len)
1846 {
1847  session_t *s0;
1848  int rv, offset;
1849 
1850  ASSERT (seq_gt (vnet_buffer (b)->tcp.seq_number, tc->rcv_nxt));
1851  ASSERT (data_len);
1852 
1853  /* Enqueue out-of-order data with relative offset */
1854  rv = session_enqueue_stream_connection (&tc->connection, b,
1855  vnet_buffer (b)->tcp.seq_number -
1856  tc->rcv_nxt, 0 /* queue event */ ,
1857  0);
1858 
1859  /* Nothing written */
1860  if (rv)
1861  {
1862  TCP_EVT (TCP_EVT_INPUT, tc, 1, data_len, 0);
1863  return TCP_ERROR_FIFO_FULL;
1864  }
1865 
1866  TCP_EVT (TCP_EVT_INPUT, tc, 1, data_len, data_len);
1867  tc->bytes_in += data_len;
1868 
1869  /* Update SACK list if in use */
1870  if (tcp_opts_sack_permitted (&tc->rcv_opts))
1871  {
1872  ooo_segment_t *newest;
1873  u32 start, end;
1874 
1875  s0 = session_get (tc->c_s_index, tc->c_thread_index);
1876 
1877  /* Get the newest segment from the fifo */
1878  newest = svm_fifo_newest_ooo_segment (s0->rx_fifo);
1879  if (newest)
1880  {
1881  offset = ooo_segment_offset_prod (s0->rx_fifo, newest);
1882  ASSERT (offset <= vnet_buffer (b)->tcp.seq_number - tc->rcv_nxt);
1883  start = tc->rcv_nxt + offset;
1884  end = start + ooo_segment_length (s0->rx_fifo, newest);
1885  tcp_update_sack_list (tc, start, end);
1886  svm_fifo_newest_ooo_segment_reset (s0->rx_fifo);
1887  TCP_EVT (TCP_EVT_CC_SACKS, tc);
1888  }
1889  }
1890 
1891  return TCP_ERROR_ENQUEUED_OOO;
1892 }
1893 
1894 /**
1895  * Check if ACK could be delayed. Returns 1 if the ack can be delayed
1896  * and 0 if it must be sent immediately, e.g., when TCP_ALWAYS_ACK is set.
1897  */
1898 always_inline int
1899 tcp_can_delack (tcp_connection_t * tc)
1900 {
1901  /* Send ack if ... */
1902  if (TCP_ALWAYS_ACK
1903  /* just sent a rcv wnd 0
1904  || (tc->flags & TCP_CONN_SENT_RCV_WND0) != 0 */
1905  /* constrained to send ack */
1906  || (tc->flags & TCP_CONN_SNDACK) != 0
1907  /* we're almost out of tx wnd */
1908  || tcp_available_cc_snd_space (tc) < 4 * tc->snd_mss)
1909  return 0;
1910 
1911  return 1;
1912 }
1913 
1914 static int
1915 tcp_buffer_discard_bytes (vlib_buffer_t * b, u32 n_bytes_to_drop)
1916 {
1917  u32 discard, first = b->current_length;
1918  vlib_main_t *vm = vlib_get_main ();
1919 
1920  /* Handle multi-buffer segments */
1921  if (n_bytes_to_drop > b->current_length)
1922  {
1923  if (!(b->flags & VLIB_BUFFER_NEXT_PRESENT))
1924  return -1;
1925  do
1926  {
1927  discard = clib_min (n_bytes_to_drop, b->current_length);
1928  vlib_buffer_advance (b, discard);
1929  b = vlib_get_buffer (vm, b->next_buffer);
1930  n_bytes_to_drop -= discard;
1931  }
1932  while (n_bytes_to_drop);
1933  if (n_bytes_to_drop > first)
1934  b->total_length_not_including_first_buffer -= n_bytes_to_drop - first;
1935  }
1936  else
1937  vlib_buffer_advance (b, n_bytes_to_drop);
1938  vnet_buffer (b)->tcp.data_len -= n_bytes_to_drop;
1939  return 0;
1940 }
1941 
1942 /**
1943  * Receive buffer for connection and handle acks
1944  *
1945  * It handles both in-order and out-of-order data.
1946  */
1947 static int
1948 tcp_segment_rcv (tcp_worker_ctx_t * wrk, tcp_connection_t * tc,
1949  vlib_buffer_t * b)
1950 {
1951  u32 error, n_bytes_to_drop, n_data_bytes;
1952 
1953  vlib_buffer_advance (b, vnet_buffer (b)->tcp.data_offset);
1954  n_data_bytes = vnet_buffer (b)->tcp.data_len;
1955  ASSERT (n_data_bytes);
1956  tc->data_segs_in += 1;
1957 
1958  /* Handle out-of-order data */
1959  if (PREDICT_FALSE (vnet_buffer (b)->tcp.seq_number != tc->rcv_nxt))
1960  {
1961  /* Old sequence numbers allowed through because they overlapped
1962  * the rx window */
1963  if (seq_lt (vnet_buffer (b)->tcp.seq_number, tc->rcv_nxt))
1964  {
1965  /* Completely in the past (possible retransmit). Ack
1966  * retransmissions since we may not have any data to send */
1967  if (seq_leq (vnet_buffer (b)->tcp.seq_end, tc->rcv_nxt))
1968  {
1969  tcp_program_ack (tc);
1970  error = TCP_ERROR_SEGMENT_OLD;
1971  goto done;
1972  }
1973 
1974  /* Chop off the bytes in the past and see if what is left
1975  * can be enqueued in order */
1976  n_bytes_to_drop = tc->rcv_nxt - vnet_buffer (b)->tcp.seq_number;
1977  n_data_bytes -= n_bytes_to_drop;
1978  vnet_buffer (b)->tcp.seq_number = tc->rcv_nxt;
1979  if (tcp_buffer_discard_bytes (b, n_bytes_to_drop))
1980  {
1981  error = TCP_ERROR_SEGMENT_OLD;
1982  goto done;
1983  }
1984  goto in_order;
1985  }
1986 
1987  /* RFC2581: Enqueue and send DUPACK for fast retransmit */
1988  error = tcp_session_enqueue_ooo (tc, b, n_data_bytes);
1989  tcp_program_dupack (tc);
1990  TCP_EVT (TCP_EVT_DUPACK_SENT, tc, vnet_buffer (b)->tcp);
1991  tc->errors.above_data_wnd += seq_gt (vnet_buffer (b)->tcp.seq_end,
1992  tc->rcv_las + tc->rcv_wnd);
1993  goto done;
1994  }
1995 
1996 in_order:
1997 
1998  /* In order data, enqueue. Fifo figures out by itself if any out-of-order
1999  * segments can be enqueued after fifo tail offset changes. */
2000  error = tcp_session_enqueue_data (tc, b, n_data_bytes);
2001  if (tcp_can_delack (tc))
2002  {
2003  if (!tcp_timer_is_active (tc, TCP_TIMER_DELACK))
2004  tcp_timer_set (tc, TCP_TIMER_DELACK, tcp_cfg.delack_time);
2005  goto done;
2006  }
2007 
2008  tcp_program_ack (tc);
2009 
2010 done:
2011  return error;
2012 }
2013 
2014 typedef struct
2015 {
2016  tcp_header_t tcp_header;
2017  tcp_connection_t tcp_connection;
2018 } tcp_rx_trace_t;
2019 
2020 static u8 *
2021 format_tcp_rx_trace (u8 * s, va_list * args)
2022 {
2023  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
2024  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
2025  tcp_rx_trace_t *t = va_arg (*args, tcp_rx_trace_t *);
2026  u32 indent = format_get_indent (s);
2027 
2028  s = format (s, "%U\n%U%U",
2029  format_tcp_header, &t->tcp_header, 128,
2030  format_white_space, indent,
2031  format_tcp_connection, &t->tcp_connection, 1);
2032 
2033  return s;
2034 }
2035 
2036 static u8 *
2037 format_tcp_rx_trace_short (u8 * s, va_list * args)
2038 {
2039  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
2040  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
2041  tcp_rx_trace_t *t = va_arg (*args, tcp_rx_trace_t *);
2042 
2043  s = format (s, "%d -> %d (%U)",
2044  clib_net_to_host_u16 (t->tcp_header.dst_port),
2045  clib_net_to_host_u16 (t->tcp_header.src_port), format_tcp_state,
2046  t->tcp_connection.state);
2047 
2048  return s;
2049 }
2050 
2051 static void
2052 tcp_set_rx_trace_data (tcp_rx_trace_t * t0, tcp_connection_t * tc0,
2053  tcp_header_t * th0, vlib_buffer_t * b0, u8 is_ip4)
2054 {
2055  if (tc0)
2056  {
2057  clib_memcpy_fast (&t0->tcp_connection, tc0,
2058  sizeof (t0->tcp_connection));
2059  }
2060  else
2061  {
2062  th0 = tcp_buffer_hdr (b0);
2063  }
2064  clib_memcpy_fast (&t0->tcp_header, th0, sizeof (t0->tcp_header));
2065 }
2066 
2067 static void
2068 tcp_established_trace_frame (vlib_main_t * vm, vlib_node_runtime_t * node,
2069  vlib_frame_t * frame, u8 is_ip4)
2070 {
2071  u32 *from, n_left;
2072 
2073  n_left = frame->n_vectors;
2074  from = vlib_frame_vector_args (frame);
2075 
2076  while (n_left >= 1)
2077  {
2078  tcp_connection_t *tc0;
2079  tcp_rx_trace_t *t0;
2080  tcp_header_t *th0;
2081  vlib_buffer_t *b0;
2082  u32 bi0;
2083 
2084  bi0 = from[0];
2085  b0 = vlib_get_buffer (vm, bi0);
2086 
2087  if (b0->flags & VLIB_BUFFER_IS_TRACED)
2088  {
2089  t0 = vlib_add_trace (vm, node, b0, sizeof (*t0));
2090  tc0 = tcp_connection_get (vnet_buffer (b0)->tcp.connection_index,
2091  vm->thread_index);
2092  th0 = tcp_buffer_hdr (b0);
2093  tcp_set_rx_trace_data (t0, tc0, th0, b0, is_ip4);
2094  }
2095 
2096  from += 1;
2097  n_left -= 1;
2098  }
2099 }
2100 
2101 always_inline void
2102 tcp_node_inc_counter_i (vlib_main_t * vm, u32 tcp4_node, u32 tcp6_node,
2103  u8 is_ip4, u32 evt, u32 val)
2104 {
2105  if (is_ip4)
2106  vlib_node_increment_counter (vm, tcp4_node, evt, val);
2107  else
2108  vlib_node_increment_counter (vm, tcp6_node, evt, val);
2109 }
2110 
2111 #define tcp_maybe_inc_counter(node_id, err, count) \
2112 { \
2113  if (next0 != tcp_next_drop (is_ip4)) \
2114  tcp_node_inc_counter_i (vm, tcp4_##node_id##_node.index, \
2115  tcp6_##node_id##_node.index, is_ip4, err, \
2116  1); \
2117 }
2118 #define tcp_inc_counter(node_id, err, count) \
2119  tcp_node_inc_counter_i (vm, tcp4_##node_id##_node.index, \
2120  tcp6_##node_id##_node.index, is_ip4, \
2121  err, count)
2122 #define tcp_maybe_inc_err_counter(cnts, err) \
2123 { \
2124  cnts[err] += (next0 != tcp_next_drop (is_ip4)); \
2125 }
2126 #define tcp_inc_err_counter(cnts, err, val) \
2127 { \
2128  cnts[err] += val; \
2129 }
2130 #define tcp_store_err_counters(node_id, cnts) \
2131 { \
2132  int i; \
2133  for (i = 0; i < TCP_N_ERROR; i++) \
2134  if (cnts[i]) \
2135  tcp_inc_counter(node_id, i, cnts[i]); \
2136 }
2137 
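These macros batch per-error counts in a per-frame stack array and flush them to the node counters once, instead of bumping a shared counter for every packet. A small self-contained sketch of the same pattern (the node name, error count, and per-packet results below are made up for illustration):

  #include <stdint.h>
  #include <stdio.h>

  #define N_ERROR 4  /* stand-in for TCP_N_ERROR */

  /* One flush per non-zero counter, mirroring tcp_store_err_counters */
  static void
  store_err_counters (const char *node, uint16_t * cnts)
  {
    int i;
    for (i = 0; i < N_ERROR; i++)
      if (cnts[i])
        printf ("%s: error %d += %u\n", node, i, cnts[i]);
  }

  int
  main (void)
  {
    uint16_t err_counters[N_ERROR] = { 0 };
    int pkt_errors[] = { 0, 2, 0, 0, 1, 2 };  /* hypothetical per-packet results */
    unsigned i;

    for (i = 0; i < sizeof (pkt_errors) / sizeof (pkt_errors[0]); i++)
      err_counters[pkt_errors[i]] += 1;  /* tcp_inc_err_counter equivalent */
    store_err_counters ("tcp4-established", err_counters);
    return 0;
  }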
2138 
2139 always_inline uword
2140 tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
2141  vlib_frame_t * frame, int is_ip4)
2142 {
2143  u32 thread_index = vm->thread_index, errors = 0;
2144  tcp_worker_ctx_t *wrk = tcp_get_worker (thread_index);
2145  u32 n_left_from, *from, *first_buffer;
2146  u16 err_counters[TCP_N_ERROR] = { 0 };
2147 
2148  if (node->flags & VLIB_NODE_FLAG_TRACE)
2149  tcp_established_trace_frame (vm, node, frame, is_ip4);
2150 
2151  first_buffer = from = vlib_frame_vector_args (frame);
2152  n_left_from = frame->n_vectors;
2153 
2154  while (n_left_from > 0)
2155  {
2156  u32 bi0, error0 = TCP_ERROR_ACK_OK;
2157  vlib_buffer_t *b0;
2158  tcp_header_t *th0;
2159  tcp_connection_t *tc0;
2160 
2161  if (n_left_from > 1)
2162  {
2163  vlib_buffer_t *pb;
2164  pb = vlib_get_buffer (vm, from[1]);
2165  vlib_prefetch_buffer_header (pb, LOAD);
2166  CLIB_PREFETCH (pb->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
2167  }
2168 
2169  bi0 = from[0];
2170  from += 1;
2171  n_left_from -= 1;
2172 
2173  b0 = vlib_get_buffer (vm, bi0);
2174  tc0 = tcp_connection_get (vnet_buffer (b0)->tcp.connection_index,
2175  thread_index);
2176 
2177  if (PREDICT_FALSE (tc0 == 0))
2178  {
2179  error0 = TCP_ERROR_INVALID_CONNECTION;
2180  goto done;
2181  }
2182 
2183  th0 = tcp_buffer_hdr (b0);
2184 
2185  /* TODO header prediction fast path */
2186 
2187  /* 1-4: check SEQ, RST, SYN */
2188  if (PREDICT_FALSE (tcp_segment_validate (wrk, tc0, b0, th0, &error0)))
2189  {
2190  TCP_EVT (TCP_EVT_SEG_INVALID, tc0, vnet_buffer (b0)->tcp);
2191  goto done;
2192  }
2193 
2194  /* 5: check the ACK field */
2195  if (PREDICT_FALSE (tcp_rcv_ack (wrk, tc0, b0, th0, &error0)))
2196  goto done;
2197 
2198  /* 6: check the URG bit TODO */
2199 
2200  /* 7: process the segment text */
2201  if (vnet_buffer (b0)->tcp.data_len)
2202  error0 = tcp_segment_rcv (wrk, tc0, b0);
2203 
2204  /* 8: check the FIN bit */
2205  if (PREDICT_FALSE (tcp_is_fin (th0)))
2206  tcp_rcv_fin (wrk, tc0, b0, &error0);
2207 
2208  done:
2209  tcp_inc_err_counter (err_counters, error0, 1);
2210  }
2211 
2212  errors = session_main_flush_enqueue_events (TRANSPORT_PROTO_TCP,
2213  thread_index);
2214  err_counters[TCP_ERROR_MSG_QUEUE_FULL] = errors;
2215  tcp_store_err_counters (established, err_counters);
2216  tcp_handle_postponed_dequeues (wrk);
2217  tcp_handle_disconnects (wrk);
2218  vlib_buffer_free (vm, first_buffer, frame->n_vectors);
2219 
2220  return frame->n_vectors;
2221 }
2222 
2223 VLIB_NODE_FN (tcp4_established_node) (vlib_main_t * vm,
2224  vlib_node_runtime_t * node,
2225  vlib_frame_t * from_frame)
2226 {
2227  return tcp46_established_inline (vm, node, from_frame, 1 /* is_ip4 */ );
2228 }
2229 
2230 VLIB_NODE_FN (tcp6_established_node) (vlib_main_t * vm,
2231  vlib_node_runtime_t * node,
2232  vlib_frame_t * from_frame)
2233 {
2234  return tcp46_established_inline (vm, node, from_frame, 0 /* is_ip4 */ );
2235 }
2236 
2237 /* *INDENT-OFF* */
2238 VLIB_REGISTER_NODE (tcp4_established_node) =
2239 {
2240  .name = "tcp4-established",
2241  /* Takes a vector of packets. */
2242  .vector_size = sizeof (u32),
2243  .n_errors = TCP_N_ERROR,
2244  .error_strings = tcp_error_strings,
2245  .n_next_nodes = TCP_ESTABLISHED_N_NEXT,
2246  .next_nodes =
2247  {
2248 #define _(s,n) [TCP_ESTABLISHED_NEXT_##s] = n,
2249  foreach_tcp_state_next
2250 #undef _
2251  },
2252  .format_trace = format_tcp_rx_trace_short,
2253 };
2254 /* *INDENT-ON* */
2255 
2256 /* *INDENT-OFF* */
2257 VLIB_REGISTER_NODE (tcp6_established_node) =
2258 {
2259  .name = "tcp6-established",
2260  /* Takes a vector of packets. */
2261  .vector_size = sizeof (u32),
2262  .n_errors = TCP_N_ERROR,
2263  .error_strings = tcp_error_strings,
2264  .n_next_nodes = TCP_ESTABLISHED_N_NEXT,
2265  .next_nodes =
2266  {
2267 #define _(s,n) [TCP_ESTABLISHED_NEXT_##s] = n,
2268  foreach_tcp_state_next
2269 #undef _
2270  },
2271  .format_trace = format_tcp_rx_trace_short,
2272 };
2273 /* *INDENT-ON* */
2274 
2275 
2276 static u8
2277 tcp_lookup_is_valid (tcp_connection_t * tc, vlib_buffer_t * b,
2278  tcp_header_t * hdr)
2279 {
2280  transport_connection_t *tmp = 0;
2281  u64 handle;
2282 
2283  if (!tc)
2284  return 1;
2285 
2286  /* Proxy case */
2287  if (tc->c_lcl_port == 0 && tc->state == TCP_STATE_LISTEN)
2288  return 1;
2289 
2290  u8 is_ip_valid = 0, val_l, val_r;
2291 
2292  if (tc->connection.is_ip4)
2293  {
2294  ip4_header_t *ip4_hdr = vlib_buffer_get_current (b);
2295 
2296  val_l = !ip4_address_compare (&ip4_hdr->dst_address,
2297  &tc->connection.lcl_ip.ip4);
2298  val_l = val_l || ip_is_zero (&tc->connection.lcl_ip, 1);
2299  val_r = !ip4_address_compare (&ip4_hdr->src_address,
2300  &tc->connection.rmt_ip.ip4);
2301  val_r = val_r || tc->state == TCP_STATE_LISTEN;
2302  is_ip_valid = val_l && val_r;
2303  }
2304  else
2305  {
2306  ip6_header_t *ip6_hdr = vlib_buffer_get_current (b);
2307 
2308  val_l = !ip6_address_compare (&ip6_hdr->dst_address,
2309  &tc->connection.lcl_ip.ip6);
2310  val_l = val_l || ip_is_zero (&tc->connection.lcl_ip, 0);
2311  val_r = !ip6_address_compare (&ip6_hdr->src_address,
2312  &tc->connection.rmt_ip.ip6);
2313  val_r = val_r || tc->state == TCP_STATE_LISTEN;
2314  is_ip_valid = val_l && val_r;
2315  }
2316 
2317  u8 is_valid = (tc->c_lcl_port == hdr->dst_port
2318  && (tc->state == TCP_STATE_LISTEN
2319  || tc->c_rmt_port == hdr->src_port) && is_ip_valid);
2320 
2321  if (!is_valid)
2322  {
2323  handle = session_lookup_half_open_handle (&tc->connection);
2324  tmp = session_lookup_half_open_connection (handle & 0xFFFFFFFF,
2325  tc->c_proto, tc->c_is_ip4);
2326 
2327  if (tmp)
2328  {
2329  if (tmp->lcl_port == hdr->dst_port
2330  && tmp->rmt_port == hdr->src_port)
2331  {
2332  TCP_DBG ("half-open is valid!");
2333  is_valid = 1;
2334  }
2335  }
2336  }
2337  return is_valid;
2338 }
2339 
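The validity check above boils down to: the local port and address must match the packet exactly, while the remote side is only pinned down once the connection is past LISTEN. A reduced, runnable sketch of the port half (conn_t is a hypothetical stand-in for tcp_connection_t; the address comparisons follow the same shape):

  #include <stdint.h>
  #include <stdio.h>

  typedef struct { uint16_t lcl_port, rmt_port; int is_listener; } conn_t;

  /* Local port must match the packet's destination; the remote port must
   * match the source unless the connection is still a listener. */
  static int
  ports_valid (const conn_t * c, uint16_t dst_port, uint16_t src_port)
  {
    return c->lcl_port == dst_port
      && (c->is_listener || c->rmt_port == src_port);
  }

  int
  main (void)
  {
    conn_t est = { 80, 34567, 0 }, lsn = { 80, 0, 1 };
    printf ("%d %d %d\n",
            ports_valid (&est, 80, 34567),   /* 1: full 4-tuple match */
            ports_valid (&est, 80, 11111),   /* 0: wrong remote port */
            ports_valid (&lsn, 80, 11111));  /* 1: listener ignores remote */
    return 0;
  }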
2340 /**
2341  * Lookup transport connection
2342  */
2343 static tcp_connection_t *
2344 tcp_lookup_connection (u32 fib_index, vlib_buffer_t * b, u8 thread_index,
2345  u8 is_ip4)
2346 {
2347  tcp_header_t *tcp;
2348  transport_connection_t *tconn;
2349  tcp_connection_t *tc;
2350  u8 is_filtered = 0;
2351  if (is_ip4)
2352  {
2353  ip4_header_t *ip4;
2354  ip4 = vlib_buffer_get_current (b);
2355  tcp = ip4_next_header (ip4);
2356  tconn = session_lookup_connection_wt4 (fib_index,
2357  &ip4->dst_address,
2358  &ip4->src_address,
2359  tcp->dst_port,
2360  tcp->src_port,
2361  TRANSPORT_PROTO_TCP,
2362  thread_index, &is_filtered);
2363  tc = tcp_get_connection_from_transport (tconn);
2364  ASSERT (tcp_lookup_is_valid (tc, b, tcp));
2365  }
2366  else
2367  {
2368  ip6_header_t *ip6;
2369  ip6 = vlib_buffer_get_current (b);
2370  tcp = ip6_next_header (ip6);
2371  tconn = session_lookup_connection_wt6 (fib_index,
2372  &ip6->dst_address,
2373  &ip6->src_address,
2374  tcp->dst_port,
2375  tcp->src_port,
2376  TRANSPORT_PROTO_TCP,
2377  thread_index, &is_filtered);
2378  tc = tcp_get_connection_from_transport (tconn);
2379  ASSERT (tcp_lookup_is_valid (tc, b, tcp));
2380  }
2381  return tc;
2382 }
2383 
2384 always_inline void
2385 tcp_check_tx_offload (tcp_connection_t * tc, int is_ipv4)
2386 {
2387  vnet_main_t *vnm = vnet_get_main ();
2388  const dpo_id_t *dpo;
2389  const load_balance_t *lb;
2390  vnet_hw_interface_t *hw_if;
2391  u32 sw_if_idx, lb_idx;
2392 
2393  if (is_ipv4)
2394  {
2395  ip4_address_t *dst_addr = &(tc->c_rmt_ip.ip4);
2396  lb_idx = ip4_fib_forwarding_lookup (tc->c_fib_index, dst_addr);
2397  }
2398  else
2399  {
2400  ip6_address_t *dst_addr = &(tc->c_rmt_ip.ip6);
2401  lb_idx = ip6_fib_table_fwding_lookup (tc->c_fib_index, dst_addr);
2402  }
2403 
2404  lb = load_balance_get (lb_idx);
2405  dpo = load_balance_get_bucket_i (lb, 0);
2406 
2407  sw_if_idx = dpo->dpoi_index;
2408  hw_if = vnet_get_sup_hw_interface (vnm, sw_if_idx);
2409 
2410  if (hw_if->flags & VNET_HW_INTERFACE_FLAG_SUPPORTS_GSO)
2411  tc->cfg_flags |= TCP_CFG_F_TSO;
2412 }
2413 
2414 always_inline uword
2415 tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
2416  vlib_frame_t * from_frame, int is_ip4)
2417 {
2418  u32 n_left_from, *from, *first_buffer, errors = 0;
2419  u32 my_thread_index = vm->thread_index;
2420  tcp_worker_ctx_t *wrk = tcp_get_worker (my_thread_index);
2421 
2422  from = first_buffer = vlib_frame_vector_args (from_frame);
2423  n_left_from = from_frame->n_vectors;
2424 
2425  while (n_left_from > 0)
2426  {
2427  u32 bi0, ack0, seq0, error0 = TCP_ERROR_NONE;
2428  tcp_connection_t *tc0, *new_tc0;
2429  tcp_header_t *tcp0 = 0;
2430  tcp_rx_trace_t *t0;
2431  vlib_buffer_t *b0;
2432 
2433  bi0 = from[0];
2434  from += 1;
2435  n_left_from -= 1;
2436 
2437  b0 = vlib_get_buffer (vm, bi0);
2438  tc0 =
2439  tcp_half_open_connection_get (vnet_buffer (b0)->tcp.connection_index);
2440  if (PREDICT_FALSE (tc0 == 0))
2441  {
2442  error0 = TCP_ERROR_INVALID_CONNECTION;
2443  goto drop;
2444  }
2445 
2446  /* Half-open completed recently but the connection wasn't removed
2447  * yet by the owning thread */
2448  if (PREDICT_FALSE (tc0->flags & TCP_CONN_HALF_OPEN_DONE))
2449  {
2450  /* Make sure the connection actually exists */
2451  ASSERT (tcp_lookup_connection (tc0->c_fib_index, b0,
2452  my_thread_index, is_ip4));
2453  error0 = TCP_ERROR_SPURIOUS_SYN_ACK;
2454  goto drop;
2455  }
2456 
2457  ack0 = vnet_buffer (b0)->tcp.ack_number;
2458  seq0 = vnet_buffer (b0)->tcp.seq_number;
2459  tcp0 = tcp_buffer_hdr (b0);
2460 
2461  /* Crude check to see if the connection handle does not match
2462  * the packet. Probably connection just switched to established */
2463  if (PREDICT_FALSE (tcp0->dst_port != tc0->c_lcl_port
2464  || tcp0->src_port != tc0->c_rmt_port))
2465  {
2466  error0 = TCP_ERROR_INVALID_CONNECTION;
2467  goto drop;
2468  }
2469 
2470  if (PREDICT_FALSE (!tcp_ack (tcp0) && !tcp_rst (tcp0)
2471  && !tcp_syn (tcp0)))
2472  {
2473  error0 = TCP_ERROR_SEGMENT_INVALID;
2474  goto drop;
2475  }
2476 
2477  /* SYNs consume sequence numbers */
2478  vnet_buffer (b0)->tcp.seq_end += tcp_is_syn (tcp0);
2479 
2480  /*
2481  * 1. check the ACK bit
2482  */
2483 
2484  /*
2485  * If the ACK bit is set
2486  * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send a reset (unless
2487  * the RST bit is set, if so drop the segment and return)
2488  * <SEQ=SEG.ACK><CTL=RST>
2489  * and discard the segment. Return.
2490  * If SND.UNA =< SEG.ACK =< SND.NXT then the ACK is acceptable.
2491  */
2492  if (tcp_ack (tcp0))
2493  {
2494  if (seq_leq (ack0, tc0->iss) || seq_gt (ack0, tc0->snd_nxt))
2495  {
2496  if (!tcp_rst (tcp0))
2497  tcp_send_reset_w_pkt (tc0, b0, my_thread_index, is_ip4);
2498  error0 = TCP_ERROR_RCV_WND;
2499  goto drop;
2500  }
2501 
2502  /* Make sure ACK is valid */
2503  if (seq_gt (tc0->snd_una, ack0))
2504  {
2505  error0 = TCP_ERROR_ACK_INVALID;
2506  goto drop;
2507  }
2508  }
2509 
2510  /*
2511  * 2. check the RST bit
2512  */
2513 
2514  if (tcp_rst (tcp0))
2515  {
2516  /* If ACK is acceptable, signal client that peer is not
2517  * willing to accept connection and drop connection */
2518  if (tcp_ack (tcp0))
2519  tcp_connection_reset (tc0);
2520  error0 = TCP_ERROR_RST_RCVD;
2521  goto drop;
2522  }
2523 
2524  /*
2525  * 3. check the security and precedence (skipped)
2526  */
2527 
2528  /*
2529  * 4. check the SYN bit
2530  */
2531 
2532  /* No SYN flag. Drop. */
2533  if (!tcp_syn (tcp0))
2534  {
2535  error0 = TCP_ERROR_SEGMENT_INVALID;
2536  goto drop;
2537  }
2538 
2539  /* Parse options */
2540  if (tcp_options_parse (tcp0, &tc0->rcv_opts, 1))
2541  {
2542  error0 = TCP_ERROR_OPTIONS;
2543  goto drop;
2544  }
2545 
2546  /* Valid SYN or SYN-ACK. Move connection from half-open pool to
2547  * current thread pool. */
2548  new_tc0 = tcp_connection_alloc_w_base (my_thread_index, tc0);
2549  new_tc0->rcv_nxt = vnet_buffer (b0)->tcp.seq_end;
2550  new_tc0->irs = seq0;
2551  new_tc0->timers[TCP_TIMER_RETRANSMIT_SYN] = TCP_TIMER_HANDLE_INVALID;
2552  new_tc0->sw_if_index = vnet_buffer (b0)->sw_if_index[VLIB_RX];
2553 
2554  /* If this is not the owning thread, wait for syn retransmit to
2555  * expire and clean up then */
2556  if (tcp_half_open_connection_cleanup (tc0))
2557  tc0->flags |= TCP_CONN_HALF_OPEN_DONE;
2558 
2559  if (tcp_opts_tstamp (&new_tc0->rcv_opts))
2560  {
2561  new_tc0->tsval_recent = new_tc0->rcv_opts.tsval;
2562  new_tc0->tsval_recent_age = tcp_time_now ();
2563  }
2564 
2565  if (tcp_opts_wscale (&new_tc0->rcv_opts))
2566  new_tc0->snd_wscale = new_tc0->rcv_opts.wscale;
2567  else
2568  new_tc0->rcv_wscale = 0;
2569 
2570  new_tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window)
2571  << new_tc0->snd_wscale;
2572  new_tc0->snd_wl1 = seq0;
2573  new_tc0->snd_wl2 = ack0;
2574 
2575  tcp_connection_init_vars (new_tc0);
2576 
2577  /* SYN-ACK: See if we can switch to ESTABLISHED state */
2578  if (PREDICT_TRUE (tcp_ack (tcp0)))
2579  {
2580  /* Our SYN is ACKed: we have iss < ack = snd_una */
2581 
2582  /* TODO Dequeue acknowledged segments if we support Fast Open */
2583  new_tc0->snd_una = ack0;
2584  new_tc0->state = TCP_STATE_ESTABLISHED;
2585 
2586  /* Make sure las is initialized for the wnd computation */
2587  new_tc0->rcv_las = new_tc0->rcv_nxt;
2588 
2589  /* Notify app that we have connection. If session layer can't
2590  * allocate session send reset */
2591  if (session_stream_connect_notify (&new_tc0->connection, 0))
2592  {
2593  tcp_send_reset_w_pkt (new_tc0, b0, my_thread_index, is_ip4);
2594  tcp_connection_cleanup (new_tc0);
2595  error0 = TCP_ERROR_CREATE_SESSION_FAIL;
2596  goto drop;
2597  }
2598 
2599  new_tc0->tx_fifo_size =
2600  transport_tx_fifo_size (&new_tc0->connection);
2601  /* Update rtt with the syn-ack sample */
2602  tcp_estimate_initial_rtt (new_tc0);
2603  TCP_EVT (TCP_EVT_SYNACK_RCVD, new_tc0);
2604  error0 = TCP_ERROR_SYN_ACKS_RCVD;
2605  }
2606  /* SYN: Simultaneous open. Change state to SYN-RCVD and send SYN-ACK */
2607  else
2608  {
2609  new_tc0->state = TCP_STATE_SYN_RCVD;
2610 
2611  /* Notify app that we have connection */
2612  if (session_stream_connect_notify (&new_tc0->connection, 0))
2613  {
2614  tcp_connection_cleanup (new_tc0);
2615  tcp_send_reset_w_pkt (tc0, b0, my_thread_index, is_ip4);
2616  TCP_EVT (TCP_EVT_RST_SENT, tc0);
2617  error0 = TCP_ERROR_CREATE_SESSION_FAIL;
2618  goto drop;
2619  }
2620 
2621  new_tc0->tx_fifo_size =
2622  transport_tx_fifo_size (&new_tc0->connection);
2623  new_tc0->rtt_ts = 0;
2624  tcp_init_snd_vars (new_tc0);
2625  tcp_send_synack (new_tc0);
2626  error0 = TCP_ERROR_SYNS_RCVD;
2627  goto drop;
2628  }
2629 
2630  if (!(new_tc0->cfg_flags & TCP_CFG_F_NO_TSO))
2631  tcp_check_tx_offload (new_tc0, is_ip4);
2632 
2633  /* Read data, if any */
2634  if (PREDICT_FALSE (vnet_buffer (b0)->tcp.data_len))
2635  {
2636  clib_warning ("rcvd data in syn-sent");
2637  error0 = tcp_segment_rcv (wrk, new_tc0, b0);
2638  if (error0 == TCP_ERROR_ACK_OK)
2639  error0 = TCP_ERROR_SYN_ACKS_RCVD;
2640  }
2641  else
2642  {
2643  /* Send ack now instead of programming it because connection was
2644  * just established and it's not optional. */
2645  tcp_send_ack (new_tc0);
2646  }
2647 
2648  drop:
2649 
2650  tcp_inc_counter (syn_sent, error0, 1);
2651  if (PREDICT_FALSE ((b0->flags & VLIB_BUFFER_IS_TRACED) && tcp0 != 0))
2652  {
2653  t0 = vlib_add_trace (vm, node, b0, sizeof (*t0));
2654  clib_memcpy_fast (&t0->tcp_header, tcp0, sizeof (t0->tcp_header));
2655  clib_memcpy_fast (&t0->tcp_connection, tc0,
2656  sizeof (t0->tcp_connection));
2657  }
2658  }
2659 
2660  errors = session_main_flush_enqueue_events (TRANSPORT_PROTO_TCP,
2661  my_thread_index);
2662  tcp_inc_counter (syn_sent, TCP_ERROR_MSG_QUEUE_FULL, errors);
2663  vlib_buffer_free (vm, first_buffer, from_frame->n_vectors);
2664 
2665  return from_frame->n_vectors;
2666 }
2667 
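The ACK acceptability test in step 1 above is plain RFC793 serial-number arithmetic: an ACK in SYN-SENT must cover more than the ISS and no more than snd_nxt. A standalone sketch, with seq_leq/seq_gt reimplemented locally to show the wraparound behavior:

  #include <stdint.h>
  #include <stdio.h>

  static int seq_leq (uint32_t a, uint32_t b) { return (int32_t) (a - b) <= 0; }
  static int seq_gt (uint32_t a, uint32_t b)  { return (int32_t) (a - b) > 0; }

  /* SEG.ACK =< ISS or SEG.ACK > SND.NXT -> unacceptable, as above */
  static int
  ack_acceptable (uint32_t ack, uint32_t iss, uint32_t snd_nxt)
  {
    return !(seq_leq (ack, iss) || seq_gt (ack, snd_nxt));
  }

  int
  main (void)
  {
    uint32_t iss = 0xfffffff0, snd_nxt = iss + 1;  /* ISS near wrap point */
    printf ("%d\n", ack_acceptable (iss + 1, iss, snd_nxt)); /* 1: ACKs our SYN */
    printf ("%d\n", ack_acceptable (iss, iss, snd_nxt));     /* 0: too old */
    printf ("%d\n", ack_acceptable (iss + 2, iss, snd_nxt)); /* 0: beyond snd_nxt */
    return 0;
  }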
2668 VLIB_NODE_FN (tcp4_syn_sent_node) (vlib_main_t * vm,
2669  vlib_node_runtime_t * node,
2670  vlib_frame_t * from_frame)
2671 {
2672  return tcp46_syn_sent_inline (vm, node, from_frame, 1 /* is_ip4 */ );
2673 }
2674 
2675 VLIB_NODE_FN (tcp6_syn_sent_node) (vlib_main_t * vm,
2676  vlib_node_runtime_t * node,
2677  vlib_frame_t * from_frame)
2678 {
2679  return tcp46_syn_sent_inline (vm, node, from_frame, 0 /* is_ip4 */ );
2680 }
2681 
2682 /* *INDENT-OFF* */
2683 VLIB_REGISTER_NODE (tcp4_syn_sent_node) =
2684 {
2685  .name = "tcp4-syn-sent",
2686  /* Takes a vector of packets. */
2687  .vector_size = sizeof (u32),
2688  .n_errors = TCP_N_ERROR,
2689  .error_strings = tcp_error_strings,
2690  .n_next_nodes = TCP_SYN_SENT_N_NEXT,
2691  .next_nodes =
2692  {
2693 #define _(s,n) [TCP_SYN_SENT_NEXT_##s] = n,
2694  foreach_tcp_state_next
2695 #undef _
2696  },
2697  .format_trace = format_tcp_rx_trace_short,
2698 };
2699 /* *INDENT-ON* */
2700 
2701 /* *INDENT-OFF* */
2702 VLIB_REGISTER_NODE (tcp6_syn_sent_node) =
2703 {
2704  .name = "tcp6-syn-sent",
2705  /* Takes a vector of packets. */
2706  .vector_size = sizeof (u32),
2707  .n_errors = TCP_N_ERROR,
2708  .error_strings = tcp_error_strings,
2709  .n_next_nodes = TCP_SYN_SENT_N_NEXT,
2710  .next_nodes =
2711  {
2712 #define _(s,n) [TCP_SYN_SENT_NEXT_##s] = n,
2713  foreach_tcp_state_next
2714 #undef _
2715  },
2716  .format_trace = format_tcp_rx_trace_short,
2717 };
2718 /* *INDENT-ON* */
2719 
2720 /**
2721  * Handles reception for all states except LISTEN, SYN-SENT and ESTABLISHED
2722  * as per RFC793 p. 64
2723  */
2724 always_inline uword
2725 tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
2726  vlib_frame_t * from_frame, int is_ip4)
2727 {
2728  u32 thread_index = vm->thread_index, errors = 0, *first_buffer;
2729  tcp_worker_ctx_t *wrk = tcp_get_worker (thread_index);
2730  u32 n_left_from, *from, max_dequeue;
2731 
2732  from = first_buffer = vlib_frame_vector_args (from_frame);
2733  n_left_from = from_frame->n_vectors;
2734 
2735  while (n_left_from > 0)
2736  {
2737  u32 bi0, error0 = TCP_ERROR_NONE;
2738  tcp_header_t *tcp0 = 0;
2739  tcp_connection_t *tc0;
2740  vlib_buffer_t *b0;
2741  u8 is_fin0;
2742 
2743  bi0 = from[0];
2744  from += 1;
2745  n_left_from -= 1;
2746 
2747  b0 = vlib_get_buffer (vm, bi0);
2748  tc0 = tcp_connection_get (vnet_buffer (b0)->tcp.connection_index,
2749  thread_index);
2750  if (PREDICT_FALSE (tc0 == 0))
2751  {
2752  error0 = TCP_ERROR_INVALID_CONNECTION;
2753  goto drop;
2754  }
2755 
2756  tcp0 = tcp_buffer_hdr (b0);
2757  is_fin0 = tcp_is_fin (tcp0);
2758 
2759  if (CLIB_DEBUG)
2760  {
2761  if (!(tc0->connection.flags & TRANSPORT_CONNECTION_F_NO_LOOKUP))
2762  {
2763  tcp_connection_t *tmp;
2764  tmp = tcp_lookup_connection (tc0->c_fib_index, b0, thread_index,
2765  is_ip4);
2766  if (tmp->state != tc0->state)
2767  {
2768  if (tc0->state != TCP_STATE_CLOSED)
2769  clib_warning ("state changed");
2770  goto drop;
2771  }
2772  }
2773  }
2774 
2775  /*
2776  * Special treatment for CLOSED
2777  */
2778  if (PREDICT_FALSE (tc0->state == TCP_STATE_CLOSED))
2779  {
2780  error0 = TCP_ERROR_CONNECTION_CLOSED;
2781  goto drop;
2782  }
2783 
2784  /*
2785  * For all other states (except LISTEN)
2786  */
2787 
2788  /* 1-4: check SEQ, RST, SYN */
2789  if (PREDICT_FALSE (tcp_segment_validate (wrk, tc0, b0, tcp0, &error0)))
2790  goto drop;
2791 
2792  /* 5: check the ACK field */
2793  switch (tc0->state)
2794  {
2795  case TCP_STATE_SYN_RCVD:
2796 
2797  /* Make sure the segment is exactly right */
2798  if (tc0->rcv_nxt != vnet_buffer (b0)->tcp.seq_number || is_fin0)
2799  {
2800  tcp_connection_reset (tc0);
2801  error0 = TCP_ERROR_SEGMENT_INVALID;
2802  goto drop;
2803  }
2804 
2805  /*
2806  * If the segment acknowledgment is not acceptable, form a
2807  * reset segment,
2808  * <SEQ=SEG.ACK><CTL=RST>
2809  * and send it.
2810  */
2811  if (tcp_rcv_ack_no_cc (tc0, b0, &error0))
2812  {
2813  tcp_connection_reset (tc0);
2814  goto drop;
2815  }
2816 
2817  /* Update rtt and rto */
2818  tcp_estimate_initial_rtt (tc0);
2819  tcp_connection_tx_pacer_update (tc0);
2820 
2821  /* Switch state to ESTABLISHED */
2822  tc0->state = TCP_STATE_ESTABLISHED;
2823  TCP_EVT (TCP_EVT_STATE_CHANGE, tc0);
2824 
2825  if (!(tc0->cfg_flags & TCP_CFG_F_NO_TSO))
2826  tcp_check_tx_offload (tc0, is_ip4);
2827 
2828  /* Initialize session variables */
2829  tc0->snd_una = vnet_buffer (b0)->tcp.ack_number;
2830  tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window)
2831  << tc0->rcv_opts.wscale;
2832  tc0->snd_wl1 = vnet_buffer (b0)->tcp.seq_number;
2833  tc0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number;
2834 
2835  /* Reset SYN-ACK retransmit and SYN_RCV establish timers */
2836  tcp_retransmit_timer_reset (tc0);
2837  if (session_stream_accept_notify (&tc0->connection))
2838  {
2839  error0 = TCP_ERROR_MSG_QUEUE_FULL;
2840  tcp_connection_reset (tc0);
2841  goto drop;
2842  }
2843  error0 = TCP_ERROR_ACK_OK;
2844  break;
2845  case TCP_STATE_ESTABLISHED:
2846  /* We can get packets in established state here because they
2847  * were enqueued before state change */
2848  if (tcp_rcv_ack (wrk, tc0, b0, tcp0, &error0))
2849  goto drop;
2850 
2851  break;
2852  case TCP_STATE_FIN_WAIT_1:
2853  /* In addition to the processing for the ESTABLISHED state, if
2854  * our FIN is now acknowledged then enter FIN-WAIT-2 and
2855  * continue processing in that state. */
2856  if (tcp_rcv_ack (wrk, tc0, b0, tcp0, &error0))
2857  goto drop;
2858 
2859  /* Still have to send the FIN */
2860  if (tc0->flags & TCP_CONN_FINPNDG)
2861  {
2862  /* TX fifo finally drained */
2863  max_dequeue = transport_max_tx_dequeue (&tc0->connection);
2864  if (max_dequeue <= tc0->burst_acked)
2865  tcp_send_fin (tc0);
2866  /* If a fin was received and data was acked, extend wait */
2867  else if ((tc0->flags & TCP_CONN_FINRCVD) && tc0->bytes_acked)
2868  tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE,
2869  tcp_cfg.closewait_time);
2870  }
2871  /* If FIN is ACKed */
2872  else if (tc0->snd_una == tc0->snd_nxt)
2873  {
2874  /* Stop all retransmit timers because we have nothing more
2875  * to send. */
2876  tcp_connection_timers_reset (tc0);
2877 
2878  /* We already have a FIN but didn't transition to CLOSING
2879  * because of outstanding tx data. Close the connection. */
2880  if (tc0->flags & TCP_CONN_FINRCVD)
2881  {
2882  tcp_connection_set_state (tc0, TCP_STATE_CLOSED);
2883  tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE,
2884  tcp_cfg.cleanup_time);
2885  session_transport_closed_notify (&tc0->connection);
2886  goto drop;
2887  }
2888 
2889  tcp_connection_set_state (tc0, TCP_STATE_FIN_WAIT_2);
2890  /* Enable waitclose because we're willing to wait for peer's
2891  * FIN but not indefinitely. */
2892  tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, tcp_cfg.finwait2_time);
2893 
2894  /* Don't try to dequeue the FIN acked */
2895  if (tc0->burst_acked > 1)
2896  session_tx_fifo_dequeue_drop (&tc0->connection,
2897  tc0->burst_acked - 1);
2898  tc0->burst_acked = 0;
2899  }
2900  break;
2901  case TCP_STATE_FIN_WAIT_2:
2902  /* In addition to the processing for the ESTABLISHED state, if
2903  * the retransmission queue is empty, the user's CLOSE can be
2904  * acknowledged ("ok") but do not delete the TCB. */
2905  if (tcp_rcv_ack_no_cc (tc0, b0, &error0))
2906  goto drop;
2907  tc0->burst_acked = 0;
2908  break;
2909  case TCP_STATE_CLOSE_WAIT:
2910  /* Do the same processing as for the ESTABLISHED state. */
2911  if (tcp_rcv_ack (wrk, tc0, b0, tcp0, &error0))
2912  goto drop;
2913 
2914  if (!(tc0->flags & TCP_CONN_FINPNDG))
2915  break;
2916 
2917  /* Still have outstanding tx data */
2918  max_dequeue = transport_max_tx_dequeue (&tc0->connection);
2919  if (max_dequeue > tc0->burst_acked)
2920  break;
2921 
2922  tcp_send_fin (tc0);
2923  tcp_connection_timers_reset (tc0);
2924  tcp_connection_set_state (tc0, TCP_STATE_LAST_ACK);
2925  tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, tcp_cfg.lastack_time);
2926  break;
2927  case TCP_STATE_CLOSING:
2928  /* In addition to the processing for the ESTABLISHED state, if
2929  * the ACK acknowledges our FIN then enter the TIME-WAIT state,
2930  * otherwise ignore the segment. */
2931  if (tcp_rcv_ack_no_cc (tc0, b0, &error0))
2932  goto drop;
2933 
2934  if (tc0->snd_una != tc0->snd_nxt)
2935  goto drop;
2936 
2937  tcp_connection_timers_reset (tc0);
2938  tcp_connection_set_state (tc0, TCP_STATE_TIME_WAIT);
2939  tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, tcp_cfg.timewait_time);
2940  session_transport_closed_notify (&tc0->connection);
2941  goto drop;
2942 
2943  break;
2944  case TCP_STATE_LAST_ACK:
2945  /* The only thing that [should] arrive in this state is an
2946  * acknowledgment of our FIN. If our FIN is now acknowledged,
2947  * delete the TCB, enter the CLOSED state, and return. */
2948 
2949  if (tcp_rcv_ack_no_cc (tc0, b0, &error0))
2950  goto drop;
2951 
2952  /* Apparently our ACK for the peer's FIN was lost */
2953  if (is_fin0 && tc0->snd_una != tc0->snd_nxt)
2954  {
2955  tcp_send_fin (tc0);
2956  goto drop;
2957  }
2958 
2959  tcp_connection_set_state (tc0, TCP_STATE_CLOSED);
2960  session_transport_closed_notify (&tc0->connection);
2961 
2962  /* Don't free the connection from the data path since
2963  * we can't ensure that we have no packets already enqueued
2964  * to output. Rely instead on the waitclose timer */
2965  tcp_connection_timers_reset (tc0);
2966  tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, tcp_cfg.cleanup_time);
2967 
2968  goto drop;
2969 
2970  break;
2971  case TCP_STATE_TIME_WAIT:
2972  /* The only thing that can arrive in this state is a
2973  * retransmission of the remote FIN. Acknowledge it, and restart
2974  * the 2 MSL timeout. */
2975 
2976  if (tcp_rcv_ack_no_cc (tc0, b0, &error0))
2977  goto drop;
2978 
2979  if (!is_fin0)
2980  goto drop;
2981 
2982  tcp_program_ack (tc0);
2983  tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, tcp_cfg.timewait_time);
2984  goto drop;
2985 
2986  break;
2987  default:
2988  ASSERT (0);
2989  }
2990 
2991  /* 6: check the URG bit TODO */
2992 
2993  /* 7: process the segment text */
2994  switch (tc0->state)
2995  {
2996  case TCP_STATE_ESTABLISHED:
2997  case TCP_STATE_FIN_WAIT_1:
2998  case TCP_STATE_FIN_WAIT_2:
2999  if (vnet_buffer (b0)->tcp.data_len)
3000  error0 = tcp_segment_rcv (wrk, tc0, b0);
3001  break;
3002  case TCP_STATE_CLOSE_WAIT:
3003  case TCP_STATE_CLOSING:
3004  case TCP_STATE_LAST_ACK:
3005  case TCP_STATE_TIME_WAIT:
3006  /* This should not occur, since a FIN has been received from the
3007  * remote side. Ignore the segment text. */
3008  break;
3009  }
3010 
3011  /* 8: check the FIN bit */
3012  if (!is_fin0)
3013  goto drop;
3014 
3015  TCP_EVT (TCP_EVT_FIN_RCVD, tc0);
3016 
3017  switch (tc0->state)
3018  {
3019  case TCP_STATE_ESTABLISHED:
3020  /* Account for the FIN and send ack */
3021  tc0->rcv_nxt += 1;
3022  tcp_program_ack (tc0);
3023  tcp_connection_set_state (tc0, TCP_STATE_CLOSE_WAIT);
3024  tcp_program_disconnect (wrk, tc0);
3025  tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, tcp_cfg.closewait_time);
3026  break;
3027  case TCP_STATE_SYN_RCVD:
3028  /* Send FIN-ACK, enter LAST-ACK and because the app was not
3029  * notified yet, set a cleanup timer instead of relying on
3030  * disconnect notify and the implicit close call. */
3031  tcp_connection_timers_reset (tc0);
3032  tc0->rcv_nxt += 1;
3033  tcp_send_fin (tc0);
3034  tcp_connection_set_state (tc0, TCP_STATE_LAST_ACK);
3035  tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, tcp_cfg.lastack_time);
3036  break;
3037  case TCP_STATE_CLOSE_WAIT:
3038  case TCP_STATE_CLOSING:
3039  case TCP_STATE_LAST_ACK:
3040  /* move along .. */
3041  break;
3042  case TCP_STATE_FIN_WAIT_1:
3043  tc0->rcv_nxt += 1;
3044 
3045  if (tc0->flags & TCP_CONN_FINPNDG)
3046  {
3047  /* If data is outstanding, stay in FIN_WAIT_1 and try to finish
3048  * sending it. Since we already received a fin, do not wait
3049  * for too long. */
3050  tc0->flags |= TCP_CONN_FINRCVD;
3051  tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE,
3052  tcp_cfg.closewait_time);
3053  }
3054  else
3055  {
3056  tcp_connection_set_state (tc0, TCP_STATE_CLOSING);
3057  tcp_program_ack (tc0);
3058  /* Wait for ACK for our FIN but not forever */
3059  tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE,
3060  tcp_cfg.closing_time);
3061  }
3062  break;
3063  case TCP_STATE_FIN_WAIT_2:
3064  /* Got FIN, send ACK! Be more aggressive with resource cleanup */
3065  tc0->rcv_nxt += 1;
3066  tcp_connection_set_state (tc0, TCP_STATE_TIME_WAIT);
3067  tcp_connection_timers_reset (tc0);
3068  tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, tcp_cfg.timewait_time);
3069  tcp_program_ack (tc0);
3070  session_transport_closed_notify (&tc0->connection);
3071  break;
3072  case TCP_STATE_TIME_WAIT:
3073  /* Remain in the TIME-WAIT state. Restart the time-wait
3074  * timeout.
3075  */
3076  tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, tcp_cfg.timewait_time);
3077  break;
3078  }
3079  error0 = TCP_ERROR_FIN_RCVD;
3080 
3081  drop:
3082 
3083  tcp_inc_counter (rcv_process, error0, 1);
3084  if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
3085  {
3086  tcp_rx_trace_t *t0 = vlib_add_trace (vm, node, b0, sizeof (*t0));
3087  tcp_set_rx_trace_data (t0, tc0, tcp0, b0, is_ip4);
3088  }
3089  }
3090 
3091  errors = session_main_flush_enqueue_events (TRANSPORT_PROTO_TCP,
3092  thread_index);
3093  tcp_inc_counter (rcv_process, TCP_ERROR_MSG_QUEUE_FULL, errors);
3094  tcp_handle_postponed_dequeues (wrk);
3095  tcp_handle_disconnects (wrk);
3096  vlib_buffer_free (vm, first_buffer, from_frame->n_vectors);
3097 
3098  return from_frame->n_vectors;
3099 }
3100 
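Step 8 above implements the FIN-driven half of the RFC793 state diagram. Condensed into a runnable table, assuming the segment already passed the SEQ/ACK checks (state names mirror the TCP_STATE_* values; fin_pending stands in for the TCP_CONN_FINPNDG flag):

  #include <stdio.h>

  typedef enum
  { ESTABLISHED, SYN_RCVD, FIN_WAIT_1, FIN_WAIT_2, CLOSING,
    CLOSE_WAIT, LAST_ACK, TIME_WAIT
  } state_t;

  static state_t
  on_fin (state_t s, int fin_pending)
  {
    switch (s)
      {
      case ESTABLISHED:
        return CLOSE_WAIT;   /* ack the FIN, notify the app */
      case SYN_RCVD:
        return LAST_ACK;     /* send FIN-ACK, short cleanup timer */
      case FIN_WAIT_1:
        /* stay and flag FINRCVD while tx data is outstanding */
        return fin_pending ? FIN_WAIT_1 : CLOSING;
      case FIN_WAIT_2:
        return TIME_WAIT;    /* ack, start the 2MSL waitclose */
      default:
        return s;            /* CLOSE_WAIT/CLOSING/LAST_ACK/TIME_WAIT: stay */
      }
  }

  int
  main (void)
  {
    printf ("%d %d %d\n", on_fin (ESTABLISHED, 0), on_fin (FIN_WAIT_1, 1),
            on_fin (FIN_WAIT_2, 0));
    return 0;
  }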
3101 VLIB_NODE_FN (tcp4_rcv_process_node) (vlib_main_t * vm,
3102  vlib_node_runtime_t * node,
3103  vlib_frame_t * from_frame)
3104 {
3105  return tcp46_rcv_process_inline (vm, node, from_frame, 1 /* is_ip4 */ );
3106 }
3107 
3108 VLIB_NODE_FN (tcp6_rcv_process_node) (vlib_main_t * vm,
3109  vlib_node_runtime_t * node,
3110  vlib_frame_t * from_frame)
3111 {
3112  return tcp46_rcv_process_inline (vm, node, from_frame, 0 /* is_ip4 */ );
3113 }
3114 
3115 /* *INDENT-OFF* */
3116 VLIB_REGISTER_NODE (tcp4_rcv_process_node) =
3117 {
3118  .name = "tcp4-rcv-process",
3119  /* Takes a vector of packets. */
3120  .vector_size = sizeof (u32),
3121  .n_errors = TCP_N_ERROR,
3122  .error_strings = tcp_error_strings,
3123  .n_next_nodes = TCP_RCV_PROCESS_N_NEXT,
3124  .next_nodes =
3125  {
3126 #define _(s,n) [TCP_RCV_PROCESS_NEXT_##s] = n,
3127  foreach_tcp_state_next
3128 #undef _
3129  },
3130  .format_trace = format_tcp_rx_trace_short,
3131 };
3132 /* *INDENT-ON* */
3133 
3134 /* *INDENT-OFF* */
3135 VLIB_REGISTER_NODE (tcp6_rcv_process_node) =
3136 {
3137  .name = "tcp6-rcv-process",
3138  /* Takes a vector of packets. */
3139  .vector_size = sizeof (u32),
3140  .n_errors = TCP_N_ERROR,
3141  .error_strings = tcp_error_strings,
3142  .n_next_nodes = TCP_RCV_PROCESS_N_NEXT,
3143  .next_nodes =
3144  {
3145 #define _(s,n) [TCP_RCV_PROCESS_NEXT_##s] = n,
3146  foreach_tcp_state_next
3147 #undef _
3148  },
3149  .format_trace = format_tcp_rx_trace_short,
3150 };
3151 /* *INDENT-ON* */
3152 
3153 /**
3154  * LISTEN state processing as per RFC 793 p. 65
3155  */
3156 always_inline uword
3157 tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
3158  vlib_frame_t * from_frame, int is_ip4)
3159 {
3160  u32 n_left_from, *from, n_syns = 0, *first_buffer;
3161  u32 my_thread_index = vm->thread_index;
3162 
3163  from = first_buffer = vlib_frame_vector_args (from_frame);
3164  n_left_from = from_frame->n_vectors;
3165 
3166  while (n_left_from > 0)
3167  {
3168  u32 bi0;
3169  vlib_buffer_t *b0;
3170  tcp_rx_trace_t *t0;
3171  tcp_header_t *th0 = 0;
3172  tcp_connection_t *lc0;
3173  ip4_header_t *ip40;
3174  ip6_header_t *ip60;
3175  tcp_connection_t *child0;
3176  u32 error0 = TCP_ERROR_NONE;
3177 
3178  bi0 = from[0];
3179  from += 1;
3180  n_left_from -= 1;
3181 
3182  b0 = vlib_get_buffer (vm, bi0);
3183  lc0 = tcp_listener_get (vnet_buffer (b0)->tcp.connection_index);
3184 
3185  if (is_ip4)
3186  {
3187  ip40 = vlib_buffer_get_current (b0);
3188  th0 = ip4_next_header (ip40);
3189  }
3190  else
3191  {
3192  ip60 = vlib_buffer_get_current (b0);
3193  th0 = ip6_next_header (ip60);
3194  }
3195 
3196  /* Create child session. For syn-flood protection use filter */
3197 
3198  /* 1. first check for an RST: handled in dispatch */
3199  /* if (tcp_rst (th0))
3200  goto drop;
3201  */
3202 
3203  /* 2. second check for an ACK: handled in dispatch */
3204  /* if (tcp_ack (th0))
3205  {
3206  tcp_send_reset (b0, is_ip4);
3207  goto drop;
3208  }
3209  */
3210 
3211  /* 3. check for a SYN (did that already) */
3212 
3213  /* Make sure connection wasn't just created */
3214  child0 = tcp_lookup_connection (lc0->c_fib_index, b0, my_thread_index,
3215  is_ip4);
3216  if (PREDICT_FALSE (child0->state != TCP_STATE_LISTEN))
3217  {
3218  error0 = TCP_ERROR_CREATE_EXISTS;
3219  goto drop;
3220  }
3221 
3222  /* Create child session and send SYN-ACK */
3223  child0 = tcp_connection_alloc (my_thread_index);
3224  child0->c_lcl_port = th0->dst_port;
3225  child0->c_rmt_port = th0->src_port;
3226  child0->c_is_ip4 = is_ip4;
3227  child0->state = TCP_STATE_SYN_RCVD;
3228  child0->c_fib_index = lc0->c_fib_index;
3229  child0->cc_algo = lc0->cc_algo;
3230 
3231  if (is_ip4)
3232  {
3233  child0->c_lcl_ip4.as_u32 = ip40->dst_address.as_u32;
3234  child0->c_rmt_ip4.as_u32 = ip40->src_address.as_u32;
3235  }
3236  else
3237  {
3238  clib_memcpy_fast (&child0->c_lcl_ip6, &ip60->dst_address,
3239  sizeof (ip6_address_t));
3240  clib_memcpy_fast (&child0->c_rmt_ip6, &ip60->src_address,
3241  sizeof (ip6_address_t));
3242  }
3243 
3244  if (tcp_options_parse (th0, &child0->rcv_opts, 1))
3245  {
3246  error0 = TCP_ERROR_OPTIONS;
3247  tcp_connection_free (child0);
3248  goto drop;
3249  }
3250 
3251  child0->irs = vnet_buffer (b0)->tcp.seq_number;
3252  child0->rcv_nxt = vnet_buffer (b0)->tcp.seq_number + 1;
3253  child0->rcv_las = child0->rcv_nxt;
3254  child0->sw_if_index = vnet_buffer (b0)->sw_if_index[VLIB_RX];
3255 
3256  /* RFC1323: TSval timestamps sent on {SYN} and {SYN,ACK}
3257  * segments are used to initialize PAWS. */
3258  if (tcp_opts_tstamp (&child0->rcv_opts))
3259  {
3260  child0->tsval_recent = child0->rcv_opts.tsval;
3261  child0->tsval_recent_age = tcp_time_now ();
3262  }
3263 
3264  if (tcp_opts_wscale (&child0->rcv_opts))
3265  child0->snd_wscale = child0->rcv_opts.wscale;
3266 
3267  child0->snd_wnd = clib_net_to_host_u16 (th0->window)
3268  << child0->snd_wscale;
3269  child0->snd_wl1 = vnet_buffer (b0)->tcp.seq_number;
3270  child0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number;
3271 
3272  tcp_connection_init_vars (child0);
3273  child0->rto = TCP_RTO_MIN;
3274 
3275  if (session_stream_accept (&child0->connection, lc0->c_s_index,
3276  lc0->c_thread_index, 0 /* notify */ ))
3277  {
3278  tcp_connection_cleanup (child0);
3279  error0 = TCP_ERROR_CREATE_SESSION_FAIL;
3280  goto drop;
3281  }
3282 
3283  TCP_EVT (TCP_EVT_SYN_RCVD, child0, 1);
3284  child0->tx_fifo_size = transport_tx_fifo_size (&child0->connection);
3285  tcp_send_synack (child0);
3286 
3287  drop:
3288 
3289  if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
3290  {
3291  t0 = vlib_add_trace (vm, node, b0, sizeof (*t0));
3292  clib_memcpy_fast (&t0->tcp_header, th0, sizeof (t0->tcp_header));
3293  clib_memcpy_fast (&t0->tcp_connection, lc0,
3294  sizeof (t0->tcp_connection));
3295  }
3296 
3297  n_syns += (error0 == TCP_ERROR_NONE);
3298  }
3299 
3300  tcp_inc_counter (listen, TCP_ERROR_SYNS_RCVD, n_syns);
3301  vlib_buffer_free (vm, first_buffer, from_frame->n_vectors);
3302 
3303  return from_frame->n_vectors;
3304 }
3305 
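The child's send window above is the peer's 16-bit advertised window shifted by the wscale option from its SYN; absent the option the shift stays zero (RFC 7323 behavior). A minimal sketch of that computation:

  #include <stdint.h>
  #include <stdio.h>

  /* snd_wnd = clib_net_to_host_u16 (window) << snd_wscale, as above */
  static uint32_t
  effective_snd_wnd (uint16_t hdr_window, int has_wscale, uint8_t wscale)
  {
    return (uint32_t) hdr_window << (has_wscale ? wscale : 0);
  }

  int
  main (void)
  {
    printf ("%u\n", effective_snd_wnd (0xffff, 1, 7)); /* 8388480 bytes */
    printf ("%u\n", effective_snd_wnd (0xffff, 0, 0)); /* 65535 bytes */
    return 0;
  }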
3306 VLIB_NODE_FN (tcp4_listen_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
3307  vlib_frame_t * from_frame)
3308 {
3309  return tcp46_listen_inline (vm, node, from_frame, 1 /* is_ip4 */ );
3310 }
3311 
3312 VLIB_NODE_FN (tcp6_listen_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
3313  vlib_frame_t * from_frame)
3314 {
3315  return tcp46_listen_inline (vm, node, from_frame, 0 /* is_ip4 */ );
3316 }
3317 
3318 /* *INDENT-OFF* */
3319 VLIB_REGISTER_NODE (tcp4_listen_node) =
3320 {
3321  .name = "tcp4-listen",
3322  /* Takes a vector of packets. */
3323  .vector_size = sizeof (u32),
3324  .n_errors = TCP_N_ERROR,
3325  .error_strings = tcp_error_strings,
3326  .n_next_nodes = TCP_LISTEN_N_NEXT,
3327  .next_nodes =
3328  {
3329 #define _(s,n) [TCP_LISTEN_NEXT_##s] = n,
3330  foreach_tcp_state_next
3331 #undef _
3332  },
3333  .format_trace = format_tcp_rx_trace_short,
3334 };
3335 /* *INDENT-ON* */
3336 
3337 /* *INDENT-OFF* */
3338 VLIB_REGISTER_NODE (tcp6_listen_node) =
3339 {
3340  .name = "tcp6-listen",
3341  /* Takes a vector of packets. */
3342  .vector_size = sizeof (u32),
3343  .n_errors = TCP_N_ERROR,
3344  .error_strings = tcp_error_strings,
3345  .n_next_nodes = TCP_LISTEN_N_NEXT,
3346  .next_nodes =
3347  {
3348 #define _(s,n) [TCP_LISTEN_NEXT_##s] = n,
3349  foreach_tcp_state_next
3350 #undef _
3351  },
3352  .format_trace = format_tcp_rx_trace_short,
3353 };
3354 /* *INDENT-ON* */
3355 
3356 typedef enum _tcp_input_next
3357 {
3358  TCP_INPUT_NEXT_DROP,
3359  TCP_INPUT_NEXT_LISTEN,
3360  TCP_INPUT_NEXT_RCV_PROCESS,
3361  TCP_INPUT_NEXT_SYN_SENT,
3362  TCP_INPUT_NEXT_ESTABLISHED,
3363  TCP_INPUT_NEXT_RESET,
3364  TCP_INPUT_NEXT_PUNT,
3365  TCP_INPUT_N_NEXT
3366 } tcp_input_next_t;
3367 
3368 #define foreach_tcp4_input_next \
3369  _ (DROP, "ip4-drop") \
3370  _ (LISTEN, "tcp4-listen") \
3371  _ (RCV_PROCESS, "tcp4-rcv-process") \
3372  _ (SYN_SENT, "tcp4-syn-sent") \
3373  _ (ESTABLISHED, "tcp4-established") \
3374  _ (RESET, "tcp4-reset") \
3375  _ (PUNT, "ip4-punt")
3376 
3377 #define foreach_tcp6_input_next \
3378  _ (DROP, "ip6-drop") \
3379  _ (LISTEN, "tcp6-listen") \
3380  _ (RCV_PROCESS, "tcp6-rcv-process") \
3381  _ (SYN_SENT, "tcp6-syn-sent") \
3382  _ (ESTABLISHED, "tcp6-established") \
3383  _ (RESET, "tcp6-reset") \
3384  _ (PUNT, "ip6-punt")
3385 
3386 #define filter_flags (TCP_FLAG_SYN|TCP_FLAG_ACK|TCP_FLAG_RST|TCP_FLAG_FIN)
3387 
3388 static void
3389 tcp_input_trace_frame (vlib_main_t * vm, vlib_node_runtime_t * node,
3390  vlib_buffer_t ** bs, u32 n_bufs, u8 is_ip4)
3391 {
3392  tcp_connection_t *tc;
3393  tcp_header_t *tcp;
3394  tcp_rx_trace_t *t;
3395  int i;
3396 
3397  for (i = 0; i < n_bufs; i++)
3398  {
3399  if (bs[i]->flags & VLIB_BUFFER_IS_TRACED)
3400  {
3401  t = vlib_add_trace (vm, node, bs[i], sizeof (*t));
3402  tc = tcp_connection_get (vnet_buffer (bs[i])->tcp.connection_index,
3403  vm->thread_index);
3404  tcp = vlib_buffer_get_current (bs[i]);
3405  tcp_set_rx_trace_data (t, tc, tcp, bs[i], is_ip4);
3406  }
3407  }
3408 }
3409 
3410 static void
3411 tcp_input_set_error_next (tcp_main_t * tm, u16 * next, u32 * error, u8 is_ip4)
3412 {
3413  if (*error == TCP_ERROR_FILTERED || *error == TCP_ERROR_WRONG_THREAD)
3414  {
3415  *next = TCP_INPUT_NEXT_DROP;
3416  }
3417  else if ((is_ip4 && tm->punt_unknown4) || (!is_ip4 && tm->punt_unknown6))
3418  {
3419  *next = TCP_INPUT_NEXT_PUNT;
3420  *error = TCP_ERROR_PUNT;
3421  }
3422  else
3423  {
3424  *next = TCP_INPUT_NEXT_RESET;
3425  *error = TCP_ERROR_NO_LISTENER;
3426  }
3427 }
3428 
3429 static inline tcp_connection_t *
3430 tcp_input_lookup_buffer (vlib_buffer_t * b, u8 thread_index, u32 * error,
3431  u8 is_ip4, u8 is_nolookup)
3432 {
3433  u32 fib_index = vnet_buffer (b)->ip.fib_index;
3434  int n_advance_bytes, n_data_bytes;
3435  transport_connection_t *tc;
3436  tcp_header_t *tcp;
3437  u8 result = 0;
3438 
3439  if (is_ip4)
3440  {
3441  ip4_header_t *ip4 = vlib_buffer_get_current (b);
3442  int ip_hdr_bytes = ip4_header_bytes (ip4);
3443  if (PREDICT_FALSE (b->current_length < ip_hdr_bytes + sizeof (*tcp)))
3444  {
3445  *error = TCP_ERROR_LENGTH;
3446  return 0;
3447  }
3448  tcp = ip4_next_header (ip4);
3449  vnet_buffer (b)->tcp.hdr_offset = (u8 *) tcp - (u8 *) ip4;
3450  n_advance_bytes = (ip_hdr_bytes + tcp_header_bytes (tcp));
3451  n_data_bytes = clib_net_to_host_u16 (ip4->length) - n_advance_bytes;
3452 
3453  /* Length check. Checksum computed by ipx_local; no need to compute again */
3454  if (PREDICT_FALSE (n_data_bytes < 0))
3455  {
3456  *error = TCP_ERROR_LENGTH;
3457  return 0;
3458  }
3459 
3460  if (!is_nolookup)
3461  tc = session_lookup_connection_wt4 (fib_index, &ip4->dst_address,
3462  &ip4->src_address, tcp->dst_port,
3463  tcp->src_port,
3464  TRANSPORT_PROTO_TCP, thread_index,
3465  &result);
3466  }
3467  else
3468  {
3469  ip6_header_t *ip6 = vlib_buffer_get_current (b);
3470  if (PREDICT_FALSE (b->current_length < sizeof (*ip6) + sizeof (*tcp)))
3471  {
3472  *error = TCP_ERROR_LENGTH;
3473  return 0;
3474  }
3475  tcp = ip6_next_header (ip6);
3476  vnet_buffer (b)->tcp.hdr_offset = (u8 *) tcp - (u8 *) ip6;
3477  n_advance_bytes = tcp_header_bytes (tcp);
3478  n_data_bytes = clib_net_to_host_u16 (ip6->payload_length)
3479  - n_advance_bytes;
3480  n_advance_bytes += sizeof (ip6[0]);
3481 
3482  if (PREDICT_FALSE (n_data_bytes < 0))
3483  {
3484  *error = TCP_ERROR_LENGTH;
3485  return 0;
3486  }
3487 
3488  if (!is_nolookup)
3489  {
3490  if (PREDICT_FALSE
3491  (ip6_address_is_link_local_unicast (&ip6->dst_address)))
3492  {
3493  ip4_main_t *im = &ip4_main;
3494  fib_index = vec_elt (im->fib_index_by_sw_if_index,
3495  vnet_buffer (b)->sw_if_index[VLIB_RX]);
3496  }
3497 
3498  tc = session_lookup_connection_wt6 (fib_index, &ip6->dst_address,
3499  &ip6->src_address,
3500  tcp->dst_port, tcp->src_port,
3501  TRANSPORT_PROTO_TCP,
3502  thread_index, &result);
3503  }
3504  }
3505 
3506  if (is_nolookup)
3507  tc =
3508  (transport_connection_t *) tcp_connection_get (vnet_buffer (b)->
3509  tcp.connection_index,
3510  thread_index);
3511 
3512  vnet_buffer (b)->tcp.seq_number = clib_net_to_host_u32 (tcp->seq_number);
3513  vnet_buffer (b)->tcp.ack_number = clib_net_to_host_u32 (tcp->ack_number);
3514  vnet_buffer (b)->tcp.data_offset = n_advance_bytes;
3515  vnet_buffer (b)->tcp.data_len = n_data_bytes;
3516  vnet_buffer (b)->tcp.seq_end = vnet_buffer (b)->tcp.seq_number
3517  + n_data_bytes;
3518  vnet_buffer (b)->tcp.flags = 0;
3519 
3520  *error = result ? TCP_ERROR_NONE + result : *error;
3521 
3522  return tcp_get_connection_from_transport (tc);
3523 }
3524 
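The ip4 branch above derives every length from two header nibbles: the IHL gives the IP header size, the TCP data offset gives the TCP header size, and whatever remains of the IP total length is payload; a negative remainder flags a truncated packet. A self-contained sketch with hypothetical raw field values:

  #include <stdint.h>
  #include <stdio.h>

  /* Mirrors the n_advance_bytes / n_data_bytes arithmetic above */
  static int
  tcp_payload_bytes (uint8_t ip_ver_ihl, uint16_t ip_total_len,
                     uint8_t tcp_data_off)
  {
    int ip_hdr_bytes = (ip_ver_ihl & 0xf) * 4;  /* IHL in 32-bit words */
    int tcp_hdr_bytes = tcp_data_off * 4;       /* data offset, same units */
    return (int) ip_total_len - ip_hdr_bytes - tcp_hdr_bytes; /* < 0 => bad */
  }

  int
  main (void)
  {
    printf ("%d\n", tcp_payload_bytes (0x45, 52, 8)); /* 20+32 hdrs, 0 data */
    printf ("%d\n", tcp_payload_bytes (0x45, 40, 8)); /* truncated: -12 */
    return 0;
  }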
3525 static inline void
3526 tcp_input_dispatch_buffer (tcp_main_t * tm, tcp_connection_t * tc,
3527  vlib_buffer_t * b, u16 * next, u32 * error)
3528 {
3529  tcp_header_t *tcp;
3530  u8 flags;
3531 
3532  tcp = tcp_buffer_hdr (b);
3533  flags = tcp->flags & filter_flags;
3534  *next = tm->dispatch_table[tc->state][flags].next;
3535  *error = tm->dispatch_table[tc->state][flags].error;
3536  tc->segs_in += 1;
3537 
3538  if (PREDICT_FALSE (*error == TCP_ERROR_DISPATCH
3539  || *next == TCP_INPUT_NEXT_RESET))
3540  {
3541  /* Overload tcp flags to store state */
3542  tcp_state_t state = tc->state;
3543  vnet_buffer (b)->tcp.flags = tc->state;
3544 
3545  if (*error == TCP_ERROR_DISPATCH)
3546  clib_warning ("tcp conn %u disp error state %U flags %U",
3547  tc->c_c_index, format_tcp_state, state,
3548  format_tcp_flags, (int) flags);
3549  }
3550 }
3551 
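The dispatch itself is a single table lookup: the connection state selects a row and the header flags, masked down to FIN/SYN/RST/ACK, select the column holding the precomputed next node and error. A standalone sketch of that indexing (state indices and next values are made up; the flag bit values match tcp_packet.h):

  #include <stdint.h>
  #include <stdio.h>

  #define F_FIN 0x01
  #define F_SYN 0x02
  #define F_RST 0x04
  #define F_ACK 0x10
  #define FILTER (F_FIN | F_SYN | F_RST | F_ACK)  /* PSH/URG/ECE/CWR ignored */

  #define N_STATES 11
  #define NEXT_DROP 0
  #define NEXT_ESTABLISHED 1

  static uint8_t dispatch_next[N_STATES][FILTER + 1]; /* zero == drop */

  int
  main (void)
  {
    int established = 4;  /* hypothetical state index */
    dispatch_next[established][F_ACK] = NEXT_ESTABLISHED;

    /* PSH falls outside the mask, so ACK|PSH hits the plain-ACK entry */
    uint8_t tcp_flags = F_ACK | 0x08;
    printf ("next=%u\n", dispatch_next[established][tcp_flags & FILTER]);
    return 0;
  }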
3552 always_inline uword
3553 tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
3554  vlib_frame_t * frame, int is_ip4, u8 is_nolookup)
3555 {
3556  u32 n_left_from, *from, thread_index = vm->thread_index;
3557  tcp_main_t *tm = vnet_get_tcp_main ();
3558  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
3559  u16 nexts[VLIB_FRAME_SIZE], *next;
3560 
3561  tcp_set_time_now (tcp_get_worker (thread_index));
3562 
3563  from = vlib_frame_vector_args (frame);
3564  n_left_from = frame->n_vectors;
3565  vlib_get_buffers (vm, from, bufs, n_left_from);
3566 
3567  b = bufs;
3568  next = nexts;
3569 
3570  while (n_left_from >= 4)
3571  {
3572  u32 error0 = TCP_ERROR_NO_LISTENER, error1 = TCP_ERROR_NO_LISTENER;
3573  tcp_connection_t *tc0, *tc1;
3574 
3575  {
3576  vlib_prefetch_buffer_header (b[2], STORE);
3577  CLIB_PREFETCH (b[2]->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
3578 
3579  vlib_prefetch_buffer_header (b[3], STORE);
3580  CLIB_PREFETCH (b[3]->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
3581  }
3582 
3583  next[0] = next[1] = TCP_INPUT_NEXT_DROP;
3584 
3585  tc0 = tcp_input_lookup_buffer (b[0], thread_index, &error0, is_ip4,
3586  is_nolookup);
3587  tc1 = tcp_input_lookup_buffer (b[1], thread_index, &error1, is_ip4,
3588  is_nolookup);
3589 
3590  if (PREDICT_TRUE (!tc0 + !tc1 == 0))
3591  {
3592  ASSERT (tcp_lookup_is_valid (tc0, b[0], tcp_buffer_hdr (b[0])));
3593  ASSERT (tcp_lookup_is_valid (tc1, b[1], tcp_buffer_hdr (b[1])));
3594 
3595  vnet_buffer (b[0])->tcp.connection_index = tc0->c_c_index;
3596  vnet_buffer (b[1])->tcp.connection_index = tc1->c_c_index;
3597 
3598  tcp_input_dispatch_buffer (tm, tc0, b[0], &next[0], &error0);
3599  tcp_input_dispatch_buffer (tm, tc1, b[1], &next[1], &error1);
3600  }
3601  else
3602  {
3603  if (PREDICT_TRUE (tc0 != 0))
3604  {
3605  ASSERT (tcp_lookup_is_valid (tc0, b[0], tcp_buffer_hdr (b[0])));
3606  vnet_buffer (b[0])->tcp.connection_index = tc0->c_c_index;
3607  tcp_input_dispatch_buffer (tm, tc0, b[0], &next[0], &error0);
3608  }
3609  else
3610  tcp_input_set_error_next (tm, &next[0], &error0, is_ip4);
3611 
3612  if (PREDICT_TRUE (tc1 != 0))
3613  {
3614  ASSERT (tcp_lookup_is_valid (tc1, b[1], tcp_buffer_hdr (b[1])));
3615  vnet_buffer (b[1])->tcp.connection_index = tc1->c_c_index;
3616  tcp_input_dispatch_buffer (tm, tc1, b[1], &next[1], &error1);
3617  }
3618  else
3619  tcp_input_set_error_next (tm, &next[1], &error1, is_ip4);
3620  }
3621 
3622  b += 2;
3623  next += 2;
3624  n_left_from -= 2;
3625  }
3626  while (n_left_from > 0)
3627  {
3628  tcp_connection_t *tc0;
3629  u32 error0 = TCP_ERROR_NO_LISTENER;
3630 
3631  if (n_left_from > 1)
3632  {
3633  vlib_prefetch_buffer_header (b[1], STORE);
3634  CLIB_PREFETCH (b[1]->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
3635  }
3636 
3637  next[0] = TCP_INPUT_NEXT_DROP;
3638  tc0 = tcp_input_lookup_buffer (b[0], thread_index, &error0, is_ip4,
3639  is_nolookup);
3640  if (PREDICT_TRUE (tc0 != 0))
3641  {
3642  ASSERT (tcp_lookup_is_valid (tc0, b[0], tcp_buffer_hdr (b[0])));
3643  vnet_buffer (b[0])->tcp.connection_index = tc0->c_c_index;
3644  tcp_input_dispatch_buffer (tm, tc0, b[0], &next[0], &error0);
3645  }
3646  else
3647  tcp_input_set_error_next (tm, &next[0], &error0, is_ip4);
3648 
3649  b += 1;
3650  next += 1;
3651  n_left_from -= 1;
3652  }
3653 
3654  if (PREDICT_FALSE (node->flags & VLIB_NODE_FLAG_TRACE))
3655  tcp_input_trace_frame (vm, node, bufs, frame->n_vectors, is_ip4);
3656 
3657  vlib_buffer_enqueue_to_next (vm, node, from, nexts, frame->n_vectors);
3658  return frame->n_vectors;
3659 }
3660 
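The loop structure above is VPP's dual-loop pattern: while at least four packets remain, prefetch two buffers ahead and process two, then fall back to a scalar loop for the tail. The same shape in portable C (process() is a stand-in for the lookup-plus-dispatch body; __builtin_prefetch requires GCC or Clang):

  #include <stddef.h>

  static void process (void *pkt) { (void) pkt; }

  static void
  dual_loop (void **pkts, size_t n)
  {
    size_t i = 0;
    while (n - i >= 4)   /* keep two packets of prefetch headroom */
      {
        __builtin_prefetch (pkts[i + 2]);
        __builtin_prefetch (pkts[i + 3]);
        process (pkts[i]);
        process (pkts[i + 1]);
        i += 2;
      }
    while (i < n)        /* scalar tail, prefetching one ahead */
      {
        if (i + 1 < n)
          __builtin_prefetch (pkts[i + 1]);
        process (pkts[i++]);
      }
  }

  int
  main (void)
  {
    void *pkts[7] = { 0 };
    dual_loop (pkts, 7);
    return 0;
  }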
3661 VLIB_NODE_FN (tcp4_input_nolookup_node) (vlib_main_t * vm,
3662  vlib_node_runtime_t * node,
3663  vlib_frame_t * from_frame)
3664 {
3665  return tcp46_input_inline (vm, node, from_frame, 1 /* is_ip4 */ ,
3666  1 /* is_nolookup */ );
3667 }
3668 
3669 VLIB_NODE_FN (tcp6_input_nolookup_node) (vlib_main_t * vm,
3670  vlib_node_runtime_t * node,
3671  vlib_frame_t * from_frame)
3672 {
3673  return tcp46_input_inline (vm, node, from_frame, 0 /* is_ip4 */ ,
3674  1 /* is_nolookup */ );
3675 }
3676 
3677 /* *INDENT-OFF* */
3678 VLIB_REGISTER_NODE (tcp4_input_nolookup_node) =
3679 {
3680  .name = "tcp4-input-nolookup",
3681  /* Takes a vector of packets. */
3682  .vector_size = sizeof (u32),
3683  .n_errors = TCP_N_ERROR,
3684  .error_strings = tcp_error_strings,
3685  .n_next_nodes = TCP_INPUT_N_NEXT,
3686  .next_nodes =
3687  {
3688 #define _(s,n) [TCP_INPUT_NEXT_##s] = n,
3689  foreach_tcp4_input_next
3690 #undef _
3691  },
3692  .format_buffer = format_tcp_header,
3693  .format_trace = format_tcp_rx_trace,
3694 };
3695 /* *INDENT-ON* */
3696 
3697 /* *INDENT-OFF* */
3698 VLIB_REGISTER_NODE (tcp6_input_nolookup_node) =
3699 {
3700  .name = "tcp6-input-nolookup",
3701  /* Takes a vector of packets. */
3702  .vector_size = sizeof (u32),
3703  .n_errors = TCP_N_ERROR,
3704  .error_strings = tcp_error_strings,
3705  .n_next_nodes = TCP_INPUT_N_NEXT,
3706  .next_nodes =
3707  {
3708 #define _(s,n) [TCP_INPUT_NEXT_##s] = n,
3709  foreach_tcp6_input_next
3710 #undef _
3711  },
3712  .format_buffer = format_tcp_header,
3713  .format_trace = format_tcp_rx_trace,
3714 };
3715 /* *INDENT-ON* */
3716 
3717 VLIB_NODE_FN (tcp4_input_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
3718  vlib_frame_t * from_frame)
3719 {
3720  return tcp46_input_inline (vm, node, from_frame, 1 /* is_ip4 */ ,
3721  0 /* is_nolookup */ );
3722 }
3723 
3724 VLIB_NODE_FN (tcp6_input_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
3725  vlib_frame_t * from_frame)
3726 {
3727  return tcp46_input_inline (vm, node, from_frame, 0 /* is_ip4 */ ,
3728  0 /* is_nolookup */ );
3729 }
3730 
3731 /* *INDENT-OFF* */
3732 VLIB_REGISTER_NODE (tcp4_input_node) =
3733 {
3734  .name = "tcp4-input",
3735  /* Takes a vector of packets. */
3736  .vector_size = sizeof (u32),
3737  .n_errors = TCP_N_ERROR,
3738  .error_strings = tcp_error_strings,
3739  .n_next_nodes = TCP_INPUT_N_NEXT,
3740  .next_nodes =
3741  {
3742 #define _(s,n) [TCP_INPUT_NEXT_##s] = n,
3743  foreach_tcp4_input_next
3744 #undef _
3745  },
3746  .format_buffer = format_tcp_header,
3747  .format_trace = format_tcp_rx_trace,
3748 };
3749 /* *INDENT-ON* */
3750 
3751 /* *INDENT-OFF* */
3752 VLIB_REGISTER_NODE (tcp6_input_node) =
3753 {
3754  .name = "tcp6-input",
3755  /* Takes a vector of packets. */
3756  .vector_size = sizeof (u32),
3757  .n_errors = TCP_N_ERROR,
3758  .error_strings = tcp_error_strings,
3759  .n_next_nodes = TCP_INPUT_N_NEXT,
3760  .next_nodes =
3761  {
3762 #define _(s,n) [TCP_INPUT_NEXT_##s] = n,
3763  foreach_tcp6_input_next
3764 #undef _
3765  },
3766  .format_buffer = format_tcp_header,
3767  .format_trace = format_tcp_rx_trace,
3768 };
3769 /* *INDENT-ON* */
3770 
3771 #ifndef CLIB_MARCH_VARIANT
3772 static void
3773 tcp_dispatch_table_init (tcp_main_t * tm)
3774 {
3775  int i, j;
3776  for (i = 0; i < ARRAY_LEN (tm->dispatch_table); i++)
3777  for (j = 0; j < ARRAY_LEN (tm->dispatch_table[i]); j++)
3778  {
3779  tm->dispatch_table[i][j].next = TCP_INPUT_NEXT_DROP;
3780  tm->dispatch_table[i][j].error = TCP_ERROR_DISPATCH;
3781  }
3782 
3783 #define _(t,f,n,e) \
3784 do { \
3785  tm->dispatch_table[TCP_STATE_##t][f].next = (n); \
3786  tm->dispatch_table[TCP_STATE_##t][f].error = (e); \
3787 } while (0)
3788 
3789  /* RFC 793: In LISTEN if RST drop and if ACK return RST */
3790  _(LISTEN, 0, TCP_INPUT_NEXT_DROP, TCP_ERROR_SEGMENT_INVALID);
3791  _(LISTEN, TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, TCP_ERROR_ACK_INVALID);
3792  _(LISTEN, TCP_FLAG_RST, TCP_INPUT_NEXT_DROP, TCP_ERROR_INVALID_CONNECTION);
3793  _(LISTEN, TCP_FLAG_SYN, TCP_INPUT_NEXT_LISTEN, TCP_ERROR_NONE);
3795  TCP_ERROR_ACK_INVALID);
3797  TCP_ERROR_SEGMENT_INVALID);
3799  TCP_ERROR_SEGMENT_INVALID);
3801  TCP_ERROR_INVALID_CONNECTION);
3802  _(LISTEN, TCP_FLAG_FIN, TCP_INPUT_NEXT_RESET, TCP_ERROR_SEGMENT_INVALID);
3804  TCP_ERROR_SEGMENT_INVALID);
3806  TCP_ERROR_SEGMENT_INVALID);
3808  TCP_ERROR_NONE);
3810  TCP_ERROR_SEGMENT_INVALID);
3812  TCP_ERROR_SEGMENT_INVALID);
3814  TCP_ERROR_SEGMENT_INVALID);
3816  TCP_INPUT_NEXT_DROP, TCP_ERROR_SEGMENT_INVALID);
3817  /* ACK for a SYN-ACK -> tcp-rcv-process. */
3818  _(SYN_RCVD, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3819  _(SYN_RCVD, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3821  TCP_ERROR_NONE);
3822  _(SYN_RCVD, TCP_FLAG_SYN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3824  TCP_ERROR_NONE);
3826  TCP_ERROR_NONE);
3827  _(SYN_RCVD, TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_ACK,
3828  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3829  _(SYN_RCVD, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3831  TCP_ERROR_NONE);
3833  TCP_ERROR_NONE);
3834  _(SYN_RCVD, TCP_FLAG_FIN | TCP_FLAG_RST | TCP_FLAG_ACK,
3835  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3837  TCP_ERROR_NONE);
3838  _(SYN_RCVD, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_RST,
3839  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3840  _(SYN_RCVD, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_ACK,
3841  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3843  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3844  _(SYN_RCVD, 0, TCP_INPUT_NEXT_DROP, TCP_ERROR_SEGMENT_INVALID);
3845  /* SYN-ACK for a SYN */
3847  TCP_ERROR_NONE);
3848  _(SYN_SENT, TCP_FLAG_ACK, TCP_INPUT_NEXT_SYN_SENT, TCP_ERROR_NONE);
3849  _(SYN_SENT, TCP_FLAG_RST, TCP_INPUT_NEXT_SYN_SENT, TCP_ERROR_NONE);
3851  TCP_ERROR_NONE);
3852  _(SYN_SENT, TCP_FLAG_FIN, TCP_INPUT_NEXT_SYN_SENT, TCP_ERROR_NONE);
3854  TCP_ERROR_NONE);
3855  /* ACK for established connection -> tcp-established. */
3856  _(ESTABLISHED, TCP_FLAG_ACK, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
3857  /* FIN for established connection -> tcp-established. */
3858  _(ESTABLISHED, TCP_FLAG_FIN, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
3860  TCP_ERROR_NONE);
3862  TCP_ERROR_NONE);
3863  _(ESTABLISHED, TCP_FLAG_FIN | TCP_FLAG_RST | TCP_FLAG_ACK,
3864  TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
3866  TCP_ERROR_NONE);
3867  _(ESTABLISHED, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_ACK,
3868  TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
3869  _(ESTABLISHED, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_RST,
3870  TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
3871  _(ESTABLISHED, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_ACK,
3872  TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
3873  _(ESTABLISHED, TCP_FLAG_RST, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
3875  TCP_ERROR_NONE);
3876  _(ESTABLISHED, TCP_FLAG_SYN, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
3878  TCP_ERROR_NONE);
3880  TCP_ERROR_NONE);
3881  _(ESTABLISHED, TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_ACK,
3882  TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
3883  _(ESTABLISHED, 0, TCP_INPUT_NEXT_DROP, TCP_ERROR_SEGMENT_INVALID);
3884  /* ACK or FIN-ACK to our FIN */
3885  _(FIN_WAIT_1, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3887  TCP_ERROR_NONE);
3888  /* FIN in reply to our FIN from the other side */
3889  _(FIN_WAIT_1, 0, TCP_INPUT_NEXT_DROP, TCP_ERROR_SEGMENT_INVALID);
3890  _(FIN_WAIT_1, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3892  TCP_ERROR_NONE);
3893  _(FIN_WAIT_1, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_ACK,
3894  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3895  _(FIN_WAIT_1, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_RST,
3896  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3897  _(FIN_WAIT_1, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_ACK,
3898  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3900  TCP_ERROR_NONE);
3901  _(FIN_WAIT_1, TCP_FLAG_FIN | TCP_FLAG_RST | TCP_FLAG_ACK,
3902  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3903  _(FIN_WAIT_1, TCP_FLAG_SYN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3905  TCP_ERROR_NONE);
3907  TCP_ERROR_NONE);
3908  _(FIN_WAIT_1, TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_ACK,
3909  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3910  _(FIN_WAIT_1, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3912  TCP_ERROR_NONE);
3913  _(CLOSING, 0, TCP_INPUT_NEXT_DROP, TCP_ERROR_SEGMENT_INVALID);
3914  _(CLOSING, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3915  _(CLOSING, TCP_FLAG_SYN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3917  TCP_ERROR_NONE);
3919  TCP_ERROR_NONE);
3920  _(CLOSING, TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_ACK,
3921  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3922  _(CLOSING, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3924  TCP_ERROR_NONE);
3925  _(CLOSING, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3927  TCP_ERROR_NONE);
3929  TCP_ERROR_NONE);
3930  _(CLOSING, TCP_FLAG_FIN | TCP_FLAG_RST | TCP_FLAG_ACK,
3931  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3933  TCP_ERROR_NONE);
3934  _(CLOSING, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_ACK,
3935  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3936  _(CLOSING, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_RST,
3937  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3939  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3940  /* FIN confirming that the peer (app) has closed */
3941  _(FIN_WAIT_2, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3942  _(FIN_WAIT_2, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3944  TCP_ERROR_NONE);
3945  _(FIN_WAIT_2, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3947  TCP_ERROR_NONE);
3948  _(CLOSE_WAIT, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3950  TCP_ERROR_NONE);
3951  _(CLOSE_WAIT, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3953  TCP_ERROR_NONE);
3954  _(LAST_ACK, 0, TCP_INPUT_NEXT_DROP, TCP_ERROR_SEGMENT_INVALID);
3955  _(LAST_ACK, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3956  _(LAST_ACK, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3958  TCP_ERROR_NONE);
3960  TCP_ERROR_NONE);
3961  _(LAST_ACK, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_ACK,
3962  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3964  TCP_ERROR_NONE);
3965  _(LAST_ACK, TCP_FLAG_FIN | TCP_FLAG_RST | TCP_FLAG_ACK,
3966  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3967  _(LAST_ACK, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_RST,
3968  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3970  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3971  _(LAST_ACK, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3973  TCP_ERROR_NONE);
3974  _(LAST_ACK, TCP_FLAG_SYN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3976  TCP_ERROR_NONE);
3978  TCP_ERROR_NONE);
3979  _(LAST_ACK, TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_ACK,
3980  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3981  _(TIME_WAIT, TCP_FLAG_SYN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3982  _(TIME_WAIT, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3984  TCP_ERROR_NONE);
3985  _(TIME_WAIT, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3987  TCP_ERROR_NONE);
3988  _(TIME_WAIT, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3989  /* RFC793 CLOSED: An incoming segment containing a RST is discarded. An
3990  * incoming segment not containing a RST causes a RST to be sent in
3991  * response. */
3992  _(CLOSED, TCP_FLAG_RST, TCP_INPUT_NEXT_DROP, TCP_ERROR_CONNECTION_CLOSED);
3994  TCP_ERROR_CONNECTION_CLOSED);
3995  _(CLOSED, TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, TCP_ERROR_NONE);
3996  _(CLOSED, TCP_FLAG_SYN, TCP_INPUT_NEXT_RESET, TCP_ERROR_NONE);
3998  TCP_ERROR_NONE);
3999 #undef _
4000 }
4001 
4002 static clib_error_t *
4003 tcp_input_init (vlib_main_t * vm)
4004 {
4005  clib_error_t *error = 0;
4006  tcp_main_t *tm = vnet_get_tcp_main ();
4007 
4008  if ((error = vlib_call_init_function (vm, tcp_init)))
4009  return error;
4010 
4011  /* Initialize dispatch table. */
4012  tcp_dispatch_table_init (tm);
4013 
4014  return error;
4015 }
4016 
4017 VLIB_INIT_FUNCTION (tcp_input_init);
4018 
4019 #endif /* CLIB_MARCH_VARIANT */
4020 
4021 /*
4022  * fd.io coding-style-patch-verification: ON
4023  *
4024  * Local Variables:
4025  * eval: (c-set-style "gnu")
4026  * End:
4027  */
Definition: vec.h:522
#define tcp_recovery_off(tc)
Definition: tcp.h:463
#define clib_abs(x)
Definition: clib.h:302
#define vec_add2(V, P, N)
Add N elements to end of vector V, return pointer to new elements in P.
Definition: vec.h:560
int i
#define THZ
TCP tick frequency.
Definition: tcp.h:28
static u32 format_get_indent(u8 *s)
Definition: format.h:72
vlib_node_registration_t tcp4_rcv_process_node
(constructor) VLIB_REGISTER_NODE (tcp4_rcv_process_node)
Definition: tcp_input.c:3116
u32 * fib_index_by_sw_if_index
Table index indexed by software interface.
Definition: ip4.h:121
struct _tcp_connection tcp_connection_t
static session_t * session_get(u32 si, u32 thread_index)
Definition: session.h:293
u8 * format(u8 *s, const char *fmt,...)
Definition: format.c:424
static u32 tcp_available_cc_snd_space(const tcp_connection_t *tc)
Estimate of how many bytes we can still push into the network.
Definition: tcp.h:970
#define tcp_opts_sack(_to)
Definition: tcp_packet.h:158
u8 data[128]
Definition: ipsec.api:251
tcp_connection_t tcp_connection
Definition: tcp_input.c:2017
static u8 tcp_sack_vector_is_sane(sack_block_t *sacks)
Definition: tcp_input.c:1714
static tcp_connection_t * tcp_get_connection_from_transport(transport_connection_t *tconn)
Definition: tcp.h:733
#define VLIB_NODE_FN(node)
Definition: node.h:202
static void tcp_cc_congestion_undo(tcp_connection_t *tc)
Definition: tcp_input.c:1284
#define tcp_disconnect_pending_on(tc)
Definition: tcp.h:468
int session_enqueue_stream_connection(transport_connection_t *tc, vlib_buffer_t *b, u32 offset, u8 queue_event, u8 is_in_order)
Definition: session.c:413
u64 session_lookup_half_open_handle(transport_connection_t *tc)
No operation.
Definition: tcp_packet.h:105
format_function_t format_tcp_flags
Definition: tcp.h:65
#define pool_get(P, E)
Allocate an object E from a pool P (unspecified alignment).
Definition: pool.h:236
u8 n_sack_blocks
Number of SACKs blocks.
Definition: tcp_packet.h:151
struct _tcp_header tcp_header_t
int tcp_half_open_connection_cleanup(tcp_connection_t *tc)
Try to cleanup half-open connection.
Definition: tcp.c:211
ip6_address_t src_address
Definition: ip6_packet.h:383
void scoreboard_clear_reneging(sack_scoreboard_t *sb, u32 start, u32 end)
Definition: tcp_input.c:966
u32 * pending_deq_acked
vector of pending ack dequeues
Definition: tcp.h:519
unsigned char u8
Definition: types.h:56
#define tcp_inc_counter(node_id, err, count)
Definition: tcp_input.c:2118
vlib_node_registration_t tcp6_syn_sent_node
(constructor) VLIB_REGISTER_NODE (tcp6_syn_sent_node)
Definition: tcp_input.c:2702
struct _sack_scoreboard_hole sack_scoreboard_hole_t
u8 wscale
Option flags, see above.
Definition: tcp_packet.h:146
#define vec_reset_length(v)
Reset vector length to zero NULL-pointer tolerant.
static tcp_connection_t * tcp_lookup_connection(u32 fib_index, vlib_buffer_t *b, u8 thread_index, u8 is_ip4)
Lookup transport connection.
Definition: tcp_input.c:2344
double f64
Definition: types.h:142
#define tcp_fastrecovery_on(tc)
Definition: tcp.h:460
Limit MSS.
Definition: tcp_packet.h:106
void session_transport_closing_notify(transport_connection_t *tc)
Notification from transport that connection is being closed.
Definition: session.c:856
sack_scoreboard_hole_t * scoreboard_get_hole(sack_scoreboard_t *sb, u32 index)
Definition: tcp_input.c:671
#define TCP_TICK
TCP tick period (s)
Definition: tcp.h:27
void scoreboard_init_rxt(sack_scoreboard_t *sb, u32 snd_una)
Definition: tcp_input.c:925
#define tcp_is_fin(_th)
Definition: tcp_packet.h:90
#define seq_gt(_s1, _s2)
Definition: tcp.h:868
static u8 * format_tcp_rx_trace(u8 *s, va_list *args)
Definition: tcp_input.c:2021
static void tcp_connection_set_state(tcp_connection_t *tc, tcp_state_t state)
Definition: tcp.h:739
void tcp_init_snd_vars(tcp_connection_t *tc)
Initialize connection send variables.
Definition: tcp.c:692
#define tcp_cfg
Definition: tcp.h:676
vl_api_interface_index_t sw_if_index
Definition: gre.api:50
#define VLIB_INIT_FUNCTION(x)
Definition: init.h:173
vlib_node_registration_t tcp4_established_node
(constructor) VLIB_REGISTER_NODE (tcp4_established_node)
Definition: tcp_input.c:2238
#define always_inline
Definition: clib.h:98
#define TCP_OPTION_LEN_SACK_BLOCK
Definition: tcp_packet.h:168
ip4_address_t dst_address
Definition: ip4_packet.h:170
static u32 tcp_available_output_snd_space(const tcp_connection_t *tc)
Definition: tcp.h:955
#define TCP_FLAG_ACK
Definition: fa_node.h:16
u8 * format_white_space(u8 *s, va_list *va)
Definition: std-formats.c:129
transport_connection_t * session_lookup_connection_wt4(u32 fib_index, ip4_address_t *lcl, ip4_address_t *rmt, u16 lcl_port, u16 rmt_port, u8 proto, u32 thread_index, u8 *result)
Lookup connection with ip4 and transport layer information.
static tcp_header_t * tcp_buffer_hdr(vlib_buffer_t *b)
Definition: tcp.h:693
vnet_hw_interface_flags_t flags
Definition: interface.h:506
#define vlib_prefetch_buffer_header(b, type)
Prefetch buffer metadata.
Definition: buffer.h:203
static int tcp_segment_validate(tcp_worker_ctx_t *wrk, tcp_connection_t *tc0, vlib_buffer_t *b0, tcp_header_t *th0, u32 *error0)
Validate incoming segment as per RFC793 p.
Definition: tcp_input.c:279
enum _tcp_state tcp_state_t
#define TCP_ALWAYS_ACK
On/off delayed acks.
Definition: tcp.h:39
vlib_node_registration_t tcp6_input_node
(constructor) VLIB_REGISTER_NODE (tcp6_input_node)
Definition: tcp_input.c:3752
static u8 tcp_ack_is_dupack(tcp_connection_t *tc, vlib_buffer_t *b, u32 prev_snd_wnd, u32 prev_snd_una)
Check if duplicate ack as per RFC5681 Sec.
Definition: tcp_input.c:1534
vhost_vring_state_t state
Definition: vhost_user.h:146
#define TCP_RTO_MAX
Definition: tcp.h:99
static u32 ooo_segment_length(svm_fifo_t *f, ooo_segment_t *s)
Definition: svm_fifo.h:722
static void * ip4_next_header(ip4_header_t *i)
Definition: ip4_packet.h:241
static u32 tcp_time_now(void)
Definition: tcp.h:999
sack_block_t * sacks
SACK blocks.
Definition: tcp_packet.h:150
unsigned int u32
Definition: types.h:88
#define vec_end(v)
End (last data address) of vector.
#define vlib_call_init_function(vm, x)
Definition: init.h:270
static void tcp_node_inc_counter_i(vlib_main_t *vm, u32 tcp4_node, u32 tcp6_node, u8 is_ip4, u32 evt, u32 val)
Definition: tcp_input.c:2102
#define TCP_MAX_SACK_BLOCKS
Max number of SACK blocks stored.
Definition: tcp.h:163
#define VLIB_FRAME_SIZE
Definition: node.h:378
static void tcp_cc_init_congestion(tcp_connection_t *tc)
Init loss recovery/fast recovery.
Definition: tcp_input.c:1258
#define tcp_validate_txf_size(_tc, _a)
Definition: tcp.h:1211
static int tcp_options_parse(tcp_header_t *th, tcp_options_t *to, u8 is_syn)
Parse TCP header options.
Definition: tcp_input.c:127
#define timestamp_lt(_t1, _t2)
Definition: tcp.h:873
static void tcp_timer_set(tcp_connection_t *tc, u8 timer_id, u32 interval)
Definition: tcp.h:1105
#define TCP_OPTION_LEN_WINDOW_SCALE
Definition: tcp_packet.h:165
static void svm_fifo_newest_ooo_segment_reset(svm_fifo_t *f)
Definition: svm_fifo.h:706
static heap_elt_t * first(heap_header_t *h)
Definition: heap.c:59
void scoreboard_init(sack_scoreboard_t *sb)
Definition: tcp_input.c:939
The identity of a DPO is a combination of its type and its instance number/index of objects of that t...
Definition: dpo.h:170
static u8 tcp_should_fastrecover(tcp_connection_t *tc, u8 has_sack)
Definition: tcp_input.c:1317
vlib_main_t * vm
convenience pointer to this thread&#39;s vlib main
Definition: tcp.h:525
#define TCP_INVALID_SACK_HOLE_INDEX
Definition: tcp.h:164
#define pool_elt_at_index(p, i)
Returns pointer to element at given index.
Definition: pool.h:514
static void tcp_program_dequeue(tcp_worker_ctx_t *wrk, tcp_connection_t *tc)
Definition: tcp_input.c:646
void tcp_send_ack(tcp_connection_t *tc)
Definition: tcp_output.c:1165
static void tcp_handle_disconnects(tcp_worker_ctx_t *wrk)
Definition: tcp_input.c:1671
static uword tcp46_listen_inline(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *from_frame, int is_ip4)
LISTEN state processing as per RFC 793 p.
Definition: tcp_input.c:3157
#define tcp_in_fastrecovery(tc)
Definition: tcp.h:464
void tcp_connection_tx_pacer_reset(tcp_connection_t *tc, u32 window, u32 start_bucket)
Definition: tcp.c:1402
int tcp_fastrecovery_prr_snd_space(tcp_connection_t *tc)
Estimate send space using proportional rate reduction (RFC6937)
Definition: tcp_output.c:1821
static void tcp_input_set_error_next(tcp_main_t *tm, u16 *next, u32 *error, u8 is_ip4)
Definition: tcp_input.c:3411
tcp_connection_t * tcp_connection_alloc_w_base(u8 thread_index, tcp_connection_t *base)
Definition: tcp.c:312
static const dpo_id_t * load_balance_get_bucket_i(const load_balance_t *lb, u32 bucket)
Definition: load_balance.h:229
vlib_node_registration_t tcp4_input_nolookup_node
(constructor) VLIB_REGISTER_NODE (tcp4_input_nolookup_node)
Definition: tcp_input.c:3678
unsigned short u16
Definition: types.h:57
#define foreach_tcp4_input_next
Definition: tcp_input.c:3368
tcp_connection_t * tcp_connection_alloc(u8 thread_index)
Definition: tcp.c:299
static void * vlib_buffer_get_current(vlib_buffer_t *b)
Get pointer to current data to process.
Definition: buffer.h:229
#define filter_flags
Definition: tcp_input.c:3386
void tcp_connection_tx_pacer_update(tcp_connection_t *tc)
Definition: tcp.c:1392
#define pool_put(P, E)
Free an object E in pool P.
Definition: pool.h:286
static int tcp_buffer_discard_bytes(vlib_buffer_t *b, u32 n_bytes_to_drop)
Definition: tcp_input.c:1915
static void tcp_check_tx_offload(tcp_connection_t *tc, int is_ipv4)
Definition: tcp_input.c:2385
#define foreach_tcp6_input_next
Definition: tcp_input.c:3377
#define TCP_TIMER_HANDLE_INVALID
Definition: tcp.h:92
The FIB DPO provieds;.
Definition: load_balance.h:106
static void tcp_input_trace_frame(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_buffer_t **bs, u32 n_bufs, u8 is_ip4)
Definition: tcp_input.c:3389
int ip6_address_compare(ip6_address_t *a1, ip6_address_t *a2)
Definition: ip46_cli.c:60
static void tcp_cc_rcv_cong_ack(tcp_connection_t *tc, tcp_cc_ack_t ack_type, tcp_rate_sample_t *rs)
Definition: tcp.h:1053
#define PREDICT_FALSE(x)
Definition: clib.h:111
static int tcp_rcv_ack_no_cc(tcp_connection_t *tc, vlib_buffer_t *b, u32 *error)
Definition: tcp_input.c:421
#define vec_del1(v, i)
Delete the element at index I.
Definition: vec.h:804
#define TCP_FLAG_FIN
Definition: fa_node.h:12
static void tcp_cc_handle_event(tcp_connection_t *tc, tcp_rate_sample_t *rs, u32 is_dack)
One function to rule them all ...
Definition: tcp_input.c:1420
vlib_node_registration_t tcp4_listen_node
(constructor) VLIB_REGISTER_NODE (tcp4_listen_node)
Definition: tcp_input.c:3319
#define TCP_OPTION_LEN_TIMESTAMP
Definition: tcp_packet.h:167
static ooo_segment_t * svm_fifo_newest_ooo_segment(svm_fifo_t *f)
Definition: svm_fifo.h:698
u32 tcp_sack_list_bytes(tcp_connection_t *tc)
Definition: tcp_input.c:1786
Selective Ack block.
Definition: tcp_packet.h:109
vlib_node_registration_t tcp6_established_node
(constructor) VLIB_REGISTER_NODE (tcp6_established_node)
Definition: tcp_input.c:2257
sack_scoreboard_hole_t * scoreboard_first_hole(sack_scoreboard_t *sb)
Definition: tcp_input.c:695
static int tcp_can_delack(tcp_connection_t *tc)
Check if ACK could be delayed.
Definition: tcp_input.c:1899
static void vlib_node_increment_counter(vlib_main_t *vm, u32 node_index, u32 counter_index, u64 increment)
Definition: node_funcs.h:1150
static int tcp_cc_recover(tcp_connection_t *tc)
Definition: tcp_input.c:1345
#define TCP_FLAG_RST
Definition: fa_node.h:14
#define TCP_DBG(_fmt, _args...)
Definition: tcp_debug.h:146
static int tcp_rcv_ack(tcp_worker_ctx_t *wrk, tcp_connection_t *tc, vlib_buffer_t *b, tcp_header_t *th, u32 *error)
Process incoming ACK.
Definition: tcp_input.c:1566
#define TCP_MAX_WND_SCALE
Definition: tcp_packet.h:172
void tcp_connection_free(tcp_connection_t *tc)
Definition: tcp.c:325
#define VLIB_REGISTER_NODE(x,...)
Definition: node.h:169
vlib_node_registration_t tcp4_syn_sent_node
(constructor) VLIB_REGISTER_NODE (tcp4_syn_sent_node)
Definition: tcp_input.c:2683
u16 n_vectors
Definition: node.h:397
#define CLIB_PREFETCH(addr, size, type)
Definition: cache.h:80
vlib_main_t * vm
Definition: buffer.c:323
int ip4_address_compare(ip4_address_t *a1, ip4_address_t *a2)
Definition: ip46_cli.c:53
static_always_inline void vlib_buffer_enqueue_to_next(vlib_main_t *vm, vlib_node_runtime_t *node, u32 *buffers, u16 *nexts, uword count)
Definition: buffer_node.h:332
static void tcp_set_rx_trace_data(tcp_rx_trace_t *t0, tcp_connection_t *tc0, tcp_header_t *th0, vlib_buffer_t *b0, u8 is_ip4)
Definition: tcp_input.c:2052
void tcp_program_dupack(tcp_connection_t *tc)
Definition: tcp_output.c:1194
void tcp_send_reset(tcp_connection_t *tc)
Build and set reset packet for connection.
Definition: tcp_output.c:861
#define TCP_DUPACK_THRESHOLD
Definition: tcp.h:37
static u32 tcp_tstamp(tcp_connection_t *tc)
Generate timestamp for tcp connection.
Definition: tcp.h:1014
static tcp_connection_t * tcp_input_lookup_buffer(vlib_buffer_t *b, u8 thread_index, u32 *error, u8 is_ip4, u8 is_nolookup)
Definition: tcp_input.c:3430
format_function_t format_tcp_state
Definition: tcp.h:64
static void tcp_cc_undo_recovery(tcp_connection_t *tc)
Definition: tcp.h:1078
static void scoreboard_update_bytes(sack_scoreboard_t *sb, u32 ack, u32 snd_mss)
Definition: tcp_input.c:794
#define clib_warning(format, args...)
Definition: error.h:59
Don&#39;t register connection in lookup Does not apply to local apps and transports using the network lay...
tcp_header_t tcp_header
Definition: tcp_input.c:2016
format_function_t format_tcp_header
Definition: format.h:101
struct _transport_connection transport_connection_t
f64 rtt_time
RTT for sample.
Definition: tcp.h:282
#define pool_is_free_index(P, I)
Use free bitmap to query whether given index is free.
Definition: pool.h:283
#define ARRAY_LEN(x)
Definition: clib.h:62
#define TCP_RTT_MAX
Definition: tcp.h:101
u16 mss
Maximum segment size advertised.
Definition: tcp_packet.h:147
static void * ip6_next_header(ip6_header_t *i)
Definition: ip6_packet.h:410
static u32 transport_max_tx_dequeue(transport_connection_t *tc)
Definition: session.h:476
void tcp_send_synack(tcp_connection_t *tc)
Definition: tcp_output.c:962
static void tcp_timer_update(tcp_connection_t *tc, u8 timer_id, u32 interval)
Definition: tcp.h:1129
#define TCP_PAWS_IDLE
24 days
Definition: tcp.h:30
vslo right
#define ASSERT(truth)
#define tcp_syn(_th)
Definition: tcp_packet.h:80
static clib_error_t * tcp_input_init(vlib_main_t *vm)
Definition: tcp_input.c:4003
#define tcp_fastrecovery_first_on(tc)
Definition: tcp.h:471
static void tcp_estimate_rtt(tcp_connection_t *tc, u32 mrtt)
Compute smoothed RTT as per VJ&#39;s &#39;88 SIGCOMM and RFC6298.
Definition: tcp_input.c:454
static int tcp_update_rtt(tcp_connection_t *tc, tcp_rate_sample_t *rs, u32 ack)
Update RTT estimate and RTO timer.
Definition: tcp_input.c:497
enum _tcp_rcv_process_next tcp_rcv_process_next_t
static load_balance_t * load_balance_get(index_t lbi)
Definition: load_balance.h:220
#define seq_geq(_s1, _s2)
Definition: tcp.h:869
IPv4 main type.
Definition: ip4.h:105
static void tcp_cc_update(tcp_connection_t *tc, tcp_rate_sample_t *rs)
Definition: tcp_input.c:1396
static void tcp_handle_postponed_dequeues(tcp_worker_ctx_t *wrk)
Dequeue bytes for connections that have received acks in last burst.
Definition: tcp_input.c:597
void tcp_bt_sample_delivery_rate(tcp_connection_t *tc, tcp_rate_sample_t *rs)
Generate a delivery rate sample from recently acked bytes.
Definition: tcp_bt.c:582
static index_t ip4_fib_forwarding_lookup(u32 fib_index, const ip4_address_t *addr)
Definition: ip4_fib.h:160
static void tcp_estimate_initial_rtt(tcp_connection_t *tc)
Definition: tcp_input.c:551
static void vlib_buffer_advance(vlib_buffer_t *b, word l)
Advance current data pointer by the supplied (signed!) amount.
Definition: buffer.h:248
static int tcp_segment_check_paws(tcp_connection_t *tc)
RFC1323: Check against wrapped sequence numbers (PAWS).
Definition: tcp_input.c:241
static uword ip6_address_is_link_local_unicast(const ip6_address_t *a)
Definition: ip6_packet.h:326
static u8 tcp_cc_is_spurious_timeout_rxt(tcp_connection_t *tc)
Definition: tcp_input.c:1294
static void tcp_established_trace_frame(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame, u8 is_ip4)
Definition: tcp_input.c:2068
enum _tcp_input_next tcp_input_next_t
static void scoreboard_update_sacked_rxt(sack_scoreboard_t *sb, u32 start, u32 end, u8 has_rxt)
Definition: tcp_input.c:783
void tcp_update_sack_list(tcp_connection_t *tc, u32 start, u32 end)
Build SACK list as per RFC2018.
Definition: tcp_input.c:1737
#define tcp_fastrecovery_first_off(tc)
Definition: tcp.h:472
int session_stream_accept_notify(transport_connection_t *tc)
Definition: session.c:994
Out-of-order segment.
Definition: svm_fifo.h:29
static u8 tcp_segment_in_rcv_wnd(tcp_connection_t *tc, u32 seq, u32 end_seq)
Validate segment sequence number.
Definition: tcp_input.c:112
#define clib_max(x, y)
Definition: clib.h:288
static vlib_main_t * vlib_get_main(void)
Definition: global_funcs.h:23
static u32 tcp_time_now_w_thread(u32 thread_index)
Definition: tcp.h:1005
static clib_error_t * tcp_init(vlib_main_t *vm)
Definition: tcp.c:1669
static void * vlib_add_trace(vlib_main_t *vm, vlib_node_runtime_t *r, vlib_buffer_t *b, u32 n_data_bytes)
Definition: trace_funcs.h:55
#define vec_elt(v, i)
Get vector value at index i.
u8 ip_is_zero(ip46_address_t *ip46_address, u8 is_ip4)
Definition: ip.c:20
#define seq_lt(_s1, _s2)
Definition: tcp.h:866
#define tcp_is_syn(_th)
Definition: tcp_packet.h:89
#define tcp_opts_wscale(_to)
Definition: tcp_packet.h:157
enum _tcp_syn_sent_next tcp_syn_sent_next_t
void tcp_send_reset_w_pkt(tcp_connection_t *tc, vlib_buffer_t *pkt, u32 thread_index, u8 is_ip4)
Send reset without reusing existing buffer.
Definition: tcp_output.c:778
static void tcp_update_snd_wnd(tcp_connection_t *tc, u32 seq, u32 ack, u32 snd_wnd)
Try to update snd_wnd based on feedback received from peer.
Definition: tcp_input.c:1220
void tcp_connection_reset(tcp_connection_t *tc)
Notify session that connection has been reset.
Definition: tcp.c:343
u32 tsval
Timestamp value.
Definition: tcp_packet.h:148
enum _tcp_established_next tcp_established_next_t
u16 payload_length
Definition: ip6_packet.h:374
u32 tsecr
Echoed/reflected time stamp.
Definition: tcp_packet.h:149
vlib_node_registration_t tcp4_input_node
(constructor) VLIB_REGISTER_NODE (tcp4_input_node)
Definition: tcp_input.c:3732
index_t dpoi_index
the index of objects of that type
Definition: dpo.h:186
void tcp_send_fin(tcp_connection_t *tc)
Send FIN.
Definition: tcp_output.c:1015
#define vec_len(v)
Number of elements in vector (rvalue-only, NULL tolerant)
enum _tcp_listen_next tcp_listen_next_t
#define foreach_tcp_state_next
Definition: tcp_input.c:31
u32 next_buffer
Next buffer for this linked-list of buffers.
Definition: buffer.h:140
static u8 tcp_is_lost_fin(tcp_connection_t *tc)
Definition: tcp.h:982
static u32 scoreboard_hole_bytes(sack_scoreboard_hole_t *hole)
Definition: tcp_input.c:665
static void tcp_cc_rcv_ack(tcp_connection_t *tc, tcp_rate_sample_t *rs)
Definition: tcp.h:1046
static tcp_worker_ctx_t * tcp_get_worker(u32 thread_index)
Definition: tcp.h:687
void session_transport_closed_notify(transport_connection_t *tc)
Notification from transport that it is closed.
Definition: session.c:944
static void tcp_retransmit_timer_update(tcp_connection_t *tc)
Definition: tcp.h:1192
VLIB buffer representation.
Definition: buffer.h:102
static int tcp_session_enqueue_data(tcp_connection_t *tc, vlib_buffer_t *b, u16 data_len)
Enqueue data for delivery to application.
Definition: tcp_input.c:1797
static u8 tcp_should_fastrecover_sack(tcp_connection_t *tc)
Definition: tcp_input.c:1309
u64 uword
Definition: types.h:112
#define seq_max(_s1, _s2)
Definition: tcp.h:870
sack_scoreboard_hole_t * scoreboard_next_hole(sack_scoreboard_t *sb, sack_scoreboard_hole_t *hole)
Definition: tcp_input.c:679
sack_scoreboard_hole_t * scoreboard_prev_hole(sack_scoreboard_t *sb, sack_scoreboard_hole_t *hole)
Definition: tcp_input.c:687
static void * vlib_frame_vector_args(vlib_frame_t *f)
Get pointer to frame vector data.
Definition: node_funcs.h:244
void tcp_connection_init_vars(tcp_connection_t *tc)
Initialize tcp connection variables.
Definition: tcp.c:724
static void tcp_cc_recovered(tcp_connection_t *tc)
Definition: tcp.h:1072
static void scoreboard_remove_hole(sack_scoreboard_t *sb, sack_scoreboard_hole_t *hole)
Definition: tcp_input.c:711
sack_scoreboard_hole_t * scoreboard_next_rxt_hole(sack_scoreboard_t *sb, sack_scoreboard_hole_t *start, u8 have_unsent, u8 *can_rescue, u8 *snd_limited)
Figure out the next hole to retransmit.
Definition: tcp_input.c:865
#define TCP_OPTION_LEN_MSS
Definition: tcp_packet.h:164
sack_scoreboard_hole_t * scoreboard_last_hole(sack_scoreboard_t *sb)
Definition: tcp_input.c:703
void transport_connection_tx_pacer_reset_bucket(transport_connection_t *tc)
Reset tx pacer bucket.
Definition: transport.c:640
#define tcp_disconnect_pending(tc)
Definition: tcp.h:467
left
#define TCP_RTO_MIN
Definition: tcp.h:100
static u32 ooo_segment_offset_prod(svm_fifo_t *f, ooo_segment_t *s)
Definition: svm_fifo.h:712
struct clib_bihash_value offset
template key/value backing page structure
#define tcp_scoreboard_trace_add(_tc, _ack)
Definition: tcp.h:229
u8 * format_tcp_connection(u8 *s, va_list *args)
Definition: tcp.c:1060
static u8 tcp_recovery_no_snd_space(tcp_connection_t *tc)
Definition: tcp_input.c:579
#define vnet_buffer(b)
Definition: buffer.h:365
static tcp_connection_t * tcp_connection_get(u32 conn_index, u32 thread_index)
Definition: tcp.h:714
static u32 scoreboard_hole_index(sack_scoreboard_t *sb, sack_scoreboard_hole_t *hole)
Definition: tcp_input.c:658
static u8 tcp_lookup_is_valid(tcp_connection_t *tc, vlib_buffer_t *b, tcp_header_t *hdr)
Definition: tcp_input.c:2277
ip4_main_t ip4_main
Global ip4 main structure.
Definition: ip4_forward.c:1076
static int tcp_header_bytes(tcp_header_t *t)
Definition: tcp_packet.h:93
int session_stream_connect_notify(transport_connection_t *tc, u8 is_fail)
Definition: session.c:756
#define tcp_disconnect_pending_off(tc)
Definition: tcp.h:469
static u32 vlib_num_workers()
Definition: threads.h:367
void tcp_connection_cleanup(tcp_connection_t *tc)
Cleans up connection state.
Definition: tcp.c:239
u16 flags
Copy of main node flags.
Definition: node.h:509
Window scale.
Definition: tcp_packet.h:107
u32 session_tx_fifo_dequeue_drop(transport_connection_t *tc, u32 max_bytes)
Definition: session.c:510
void tcp_program_ack(tcp_connection_t *tc)
Definition: tcp_output.c:1184
vlib_node_registration_t tcp6_listen_node
(constructor) VLIB_REGISTER_NODE (tcp6_listen_node)
Definition: tcp_input.c:3338
#define tcp_opts_sack_permitted(_to)
Definition: tcp_packet.h:159
static int ip4_header_bytes(const ip4_header_t *i)
Definition: ip4_packet.h:235
Timestamps.
Definition: tcp_packet.h:110
int session_stream_accept(transport_connection_t *tc, u32 listener_index, u32 thread_index, u8 notify)
Accept a stream session.
Definition: session.c:1011
static_always_inline void vlib_get_buffers(vlib_main_t *vm, u32 *bi, vlib_buffer_t **b, int count)
Translate array of buffer indices into buffer pointers.
Definition: buffer_funcs.h:244
#define VLIB_NODE_FLAG_TRACE
Definition: node.h:302
tcp_bts_flags_t flags
Rate sample flags from bt sample.
Definition: tcp.h:286
#define CLIB_CACHE_LINE_BYTES
Definition: cache.h:59
u32 total_length_not_including_first_buffer
Only valid for first buffer in chain.
Definition: buffer.h:167
static uword tcp46_input_inline(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame, int is_ip4, u8 is_nolookup)
Definition: tcp_input.c:3553
static void tcp_persist_timer_set(tcp_connection_t *tc)
Definition: tcp.h:1165
static tcp_main_t * vnet_get_tcp_main()
Definition: tcp.h:681
static uword tcp46_syn_sent_inline(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *from_frame, int is_ip4)
Definition: tcp_input.c:2415
#define tcp_fastrecovery_off(tc)
Definition: tcp.h:461
static uword tcp46_rcv_process_inline(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *from_frame, int is_ip4)
Handles reception for all states except LISTEN, SYN-SENT and ESTABLISHED as per RFC793 p...
Definition: tcp_input.c:2725
static void tcp_retransmit_timer_reset(tcp_connection_t *tc)
Definition: tcp.h:1152
static vlib_buffer_t * vlib_get_buffer(vlib_main_t *vm, u32 buffer_index)
Translate buffer index into buffer pointer.
Definition: buffer_funcs.h:85
static void tcp_input_dispatch_buffer(tcp_main_t *tm, tcp_connection_t *tc, vlib_buffer_t *b, u16 *next, u32 *error)
Definition: tcp_input.c:3526
vlib_node_registration_t tcp6_input_nolookup_node
(constructor) VLIB_REGISTER_NODE (tcp6_input_nolookup_node)
Definition: tcp_input.c:3698
static u32 tcp_set_time_now(tcp_worker_ctx_t *wrk)
Definition: tcp.h:1027
#define tcp_ack(_th)
Definition: tcp_packet.h:83
static u32 transport_tx_fifo_size(transport_connection_t *tc)
Definition: session.h:497
static u8 tcp_timer_is_active(tcp_connection_t *tc, tcp_timers_e timer)
Definition: tcp.h:1206
transport_connection_t * session_lookup_half_open_connection(u64 handle, u8 proto, u8 is_ip4)
Definition: defs.h:46
static tcp_connection_t * tcp_listener_get(u32 tli)
Definition: tcp.h:764
static void tcp_cc_congestion(tcp_connection_t *tc)
Definition: tcp.h:1060
ip6_address_t dst_address
Definition: ip6_packet.h:383
static u8 tcp_ack_is_cc_event(tcp_connection_t *tc, vlib_buffer_t *b, u32 prev_snd_wnd, u32 prev_snd_una, u8 *is_dack)
Checks if ack is a congestion control event.
Definition: tcp_input.c:1547
static void tcp_persist_timer_reset(tcp_connection_t *tc)
Definition: tcp.h:1186
static char * tcp_error_strings[]
Definition: tcp_input.c:24
#define TCP_EVT(_evt, _args...)
Definition: tcp_debug.h:145
static uword pool_elts(void *v)
Number of active elements in a pool.
Definition: pool.h:128