FD.io VPP v17.10-9-gd594711 — Vector Packet Processing
tcp_input.c — annotated source listing (doxygen export; some original lines are elided)
1 /*
2  * Copyright (c) 2016 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include <vppinfra/sparse_vec.h>
17 #include <vnet/tcp/tcp_packet.h>
18 #include <vnet/tcp/tcp.h>
19 #include <vnet/session/session.h>
20 #include <math.h>
21 
/* Human-readable counter strings for every TCP error, generated by
 * X-macro expansion of tcp_error.def (each tcp_error(n,s) entry
 * contributes its string s). Order therefore matches the error enum
 * generated elsewhere from the same .def file. */
22 static char *tcp_error_strings[] = {
23 #define tcp_error(n,s) s,
24 #include <vnet/tcp/tcp_error.def>
25 #undef tcp_error
26 };
27 
28 /* All TCP nodes have the same outgoing arcs */
/* Each _(s, n) entry pairs an enum suffix with a graph-node name.
 * The per-state next-node enums below redefine _ to generate their
 * members from this list (the foreach invocation lines are elided in
 * this extraction). */
29 #define foreach_tcp_state_next \
30  _ (DROP, "error-drop") \
31  _ (TCP4_OUTPUT, "tcp4-output") \
32  _ (TCP6_OUTPUT, "tcp6-output")
33 
34 typedef enum _tcp_established_next
35 {
36 #define _(s,n) TCP_ESTABLISHED_NEXT_##s,
38 #undef _
41 
42 typedef enum _tcp_rcv_process_next
43 {
44 #define _(s,n) TCP_RCV_PROCESS_NEXT_##s,
46 #undef _
49 
50 typedef enum _tcp_syn_sent_next
51 {
52 #define _(s,n) TCP_SYN_SENT_NEXT_##s,
54 #undef _
57 
58 typedef enum _tcp_listen_next
59 {
60 #define _(s,n) TCP_LISTEN_NEXT_##s,
62 #undef _
65 
66 /* Generic, state independent indices */
67 typedef enum _tcp_state_next
68 {
69 #define _(s,n) TCP_NEXT_##s,
71 #undef _
74 
/* Select the generic output next-index for the connection's IP version. */
75 #define tcp_next_output(is_ip4) (is_ip4 ? TCP_NEXT_TCP4_OUTPUT \
76  : TCP_NEXT_TCP6_OUTPUT)
80 
81 /**
82  * Validate segment sequence number. As per RFC793:
83  *
84  * Segment Receive Test
85  * Length Window
86  * ------- ------- -------------------------------------------
87  * 0 0 SEG.SEQ = RCV.NXT
88  * 0 >0 RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND
89  * >0 0 not acceptable
90  * >0 >0 RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND
91  * or RCV.NXT =< SEG.SEQ+SEG.LEN-1 < RCV.NXT+RCV.WND
92  *
93  * This ultimately consists in checking if segment falls within the window.
94  * The one important difference compared to RFC793 is that we use rcv_las,
95  * or the rcv_nxt at last ack sent instead of rcv_nxt since that's the
96  * peer's reference when computing our receive window.
97  *
98  * This:
99  * seq_leq (end_seq, tc->rcv_las + tc->rcv_wnd) && seq_geq (seq, tc->rcv_las)
100  * however, is too strict when we have retransmits. Instead we just check that
101  * the seq is not beyond the right edge and that the end of the segment is not
102  * less than the left edge.
103  *
104  * N.B. rcv_nxt and rcv_wnd are both updated in this node if acks are sent, so
105  * use rcv_nxt in the right edge window test instead of rcv_las.
106  *
107  */
/* NOTE(review): the return-type/signature line is elided in this
 * extraction; per the doc comment above and the call site in the
 * sequence-number check below, this is the in-receive-window
 * predicate taking (tc, seq, end_seq) -- confirm against the
 * original source. Accepts the segment when its end is at or past
 * the left edge (rcv_las) and its start is at or before the right
 * edge (rcv_nxt + rcv_wnd). */
110 {
111  return (seq_geq (end_seq, tc->rcv_las)
112  && seq_leq (seq, tc->rcv_nxt + tc->rcv_wnd));
113 }
114 
115 /**
116  * Parse TCP header options.
117  *
118  * @param th TCP header
119  * @param to TCP options data structure to be populated
120  * @return -1 if parsing failed
121  */
122 int
124 {
125  const u8 *data;
126  u8 opt_len, opts_len, kind;
127  int j;
128  sack_block_t b;
129 
130  opts_len = (tcp_doff (th) << 2) - sizeof (tcp_header_t);
131  data = (const u8 *) (th + 1);
132 
133  /* Zero out all flags but those set in SYN */
134  to->flags &= (TCP_OPTS_FLAG_SACK_PERMITTED | TCP_OPTS_FLAG_WSCALE);
135 
136  for (; opts_len > 0; opts_len -= opt_len, data += opt_len)
137  {
138  kind = data[0];
139 
140  /* Get options length */
141  if (kind == TCP_OPTION_EOL)
142  break;
143  else if (kind == TCP_OPTION_NOOP)
144  {
145  opt_len = 1;
146  continue;
147  }
148  else
149  {
150  /* broken options */
151  if (opts_len < 2)
152  return -1;
153  opt_len = data[1];
154 
155  /* weird option length */
156  if (opt_len < 2 || opt_len > opts_len)
157  return -1;
158  }
159 
160  /* Parse options */
161  switch (kind)
162  {
163  case TCP_OPTION_MSS:
164  if ((opt_len == TCP_OPTION_LEN_MSS) && tcp_syn (th))
165  {
166  to->flags |= TCP_OPTS_FLAG_MSS;
167  to->mss = clib_net_to_host_u16 (*(u16 *) (data + 2));
168  }
169  break;
171  if ((opt_len == TCP_OPTION_LEN_WINDOW_SCALE) && tcp_syn (th))
172  {
173  to->flags |= TCP_OPTS_FLAG_WSCALE;
174  to->wscale = data[2];
175  if (to->wscale > TCP_MAX_WND_SCALE)
176  {
177  clib_warning ("Illegal window scaling value: %d",
178  to->wscale);
180  }
181  }
182  break;
184  if (opt_len == TCP_OPTION_LEN_TIMESTAMP)
185  {
186  to->flags |= TCP_OPTS_FLAG_TSTAMP;
187  to->tsval = clib_net_to_host_u32 (*(u32 *) (data + 2));
188  to->tsecr = clib_net_to_host_u32 (*(u32 *) (data + 6));
189  }
190  break;
192  if (opt_len == TCP_OPTION_LEN_SACK_PERMITTED && tcp_syn (th))
193  to->flags |= TCP_OPTS_FLAG_SACK_PERMITTED;
194  break;
196  /* If SACK permitted was not advertised or a SYN, break */
197  if ((to->flags & TCP_OPTS_FLAG_SACK_PERMITTED) == 0 || tcp_syn (th))
198  break;
199 
200  /* If too short or not correctly formatted, break */
201  if (opt_len < 10 || ((opt_len - 2) % TCP_OPTION_LEN_SACK_BLOCK))
202  break;
203 
204  to->flags |= TCP_OPTS_FLAG_SACK;
205  to->n_sack_blocks = (opt_len - 2) / TCP_OPTION_LEN_SACK_BLOCK;
206  vec_reset_length (to->sacks);
207  for (j = 0; j < to->n_sack_blocks; j++)
208  {
209  b.start = clib_net_to_host_u32 (*(u32 *) (data + 2 + 8 * j));
210  b.end = clib_net_to_host_u32 (*(u32 *) (data + 6 + 8 * j));
211  vec_add1 (to->sacks, b);
212  }
213  break;
214  default:
215  /* Nothing to see here */
216  continue;
217  }
218  }
219  return 0;
220 }
221 
222 /**
223  * RFC1323: Check against wrapped sequence numbers (PAWS). If we have
224  * timestamp to echo and it's less than tsval_recent, drop segment
225  * but still send an ACK in order to retain TCP's mechanism for detecting
226  * and recovering from half-open connections
227  *
228  * Or at least that's what the theory says. It seems that this might not work
229  * very well with packet reordering and fast retransmit. XXX
230  */
231 always_inline int
/* NOTE(review): signature line elided here; this is
 * tcp_segment_check_paws (tc) per the caller in the segment
 * validation code below. Returns non-zero (PAWS failure) only when
 * the peer sent a timestamp, we have a recorded tsval_recent, and
 * the incoming tsval is older than it (mod-2^31 comparison via
 * timestamp_lt). */
233 {
234  return tcp_opts_tstamp (&tc->rcv_opts) && tc->tsval_recent
235  && timestamp_lt (tc->rcv_opts.tsval, tc->tsval_recent);
236 }
237 
238 /**
239  * Update tsval recent
240  */
241 always_inline void
/* NOTE(review): signature line elided; this is
 * tcp_update_timestamp (tc, seq, seq_end) per the caller in segment
 * validation. Records the peer's TSval as TS.Recent only when the
 * segment covers the last ACKed sequence, per RFC 1323. */
243 {
244  /*
245  * RFC1323: If Last.ACK.sent falls within the range of sequence numbers
246  * of an incoming segment:
247  * SEG.SEQ <= Last.ACK.sent < SEG.SEQ + SEG.LEN
248  * then the TSval from the segment is copied to TS.Recent;
249  * otherwise, the TSval is ignored.
250  */
251  if (tcp_opts_tstamp (&tc->rcv_opts) && seq_leq (seq, tc->rcv_las)
252  && seq_leq (tc->rcv_las, seq_end))
253  {
/* Sanity: a PAWS-passing segment should never carry an older tsval. */
254  ASSERT (timestamp_leq (tc->tsval_recent, tc->rcv_opts.tsval));
255  tc->tsval_recent = tc->rcv_opts.tsval;
/* Age stamp is used by the 24-day invalidation check in segment validation. */
256  tc->tsval_recent_age = tcp_time_now ();
257  }
258 }
259 
260 /**
261  * Validate incoming segment as per RFC793 p. 69 and RFC1323 p. 19
262  *
263  * It first verifies if segment has a wrapped sequence number (PAWS) and then
264  * does the processing associated to the first four steps (ignoring security
265  * and precedence): sequence number, rst bit and syn bit checks.
266  *
267  * @return 0 if segments passes validation.
268  */
269 static int
271  vlib_buffer_t * b0, tcp_header_t * th0, u32 * next0)
272 {
273  if (PREDICT_FALSE (!tcp_ack (th0) && !tcp_rst (th0) && !tcp_syn (th0)))
274  return -1;
275 
276  if (PREDICT_FALSE (tcp_options_parse (th0, &tc0->rcv_opts)))
277  {
278  clib_warning ("options parse error");
279  return -1;
280  }
281 
282  if (tcp_segment_check_paws (tc0))
283  {
284  if (CLIB_DEBUG > 2)
285  {
286  clib_warning ("paws failed\n%U", format_tcp_connection, tc0, 2);
287  clib_warning ("seq %u seq_end %u ack %u",
288  vnet_buffer (b0)->tcp.seq_number - tc0->irs,
289  vnet_buffer (b0)->tcp.seq_end - tc0->irs,
290  vnet_buffer (b0)->tcp.ack_number - tc0->iss);
291  }
292  TCP_EVT_DBG (TCP_EVT_PAWS_FAIL, tc0, vnet_buffer (b0)->tcp.seq_number,
293  vnet_buffer (b0)->tcp.seq_end);
294 
295  /* If it just so happens that a segment updates tsval_recent for a
296  * segment over 24 days old, invalidate tsval_recent. */
297  if (timestamp_lt (tc0->tsval_recent_age + TCP_PAWS_IDLE,
298  tcp_time_now ()))
299  {
300  /* Age isn't reset until we get a valid tsval (bsd inspired) */
301  tc0->tsval_recent = 0;
302  clib_warning ("paws failed - really old segment. REALLY?");
303  }
304  else
305  {
306  /* Drop after ack if not rst */
307  if (!tcp_rst (th0))
308  {
309  tcp_make_ack (tc0, b0);
310  *next0 = tcp_next_output (tc0->c_is_ip4);
311  TCP_EVT_DBG (TCP_EVT_DUPACK_SENT, tc0);
312  return -1;
313  }
314  }
315  }
316 
317  /* 1st: check sequence number */
318  if (!tcp_segment_in_rcv_wnd (tc0, vnet_buffer (b0)->tcp.seq_number,
319  vnet_buffer (b0)->tcp.seq_end))
320  {
321  /* If our window is 0 and the packet is in sequence, let it pass
322  * through for ack processing. It should be dropped later.*/
323  if (tc0->rcv_wnd == 0
324  && tc0->rcv_nxt == vnet_buffer (b0)->tcp.seq_number)
325  {
326  /* TODO Should segment be tagged? */
327  }
328  else
329  {
330  /* If not RST, send dup ack */
331  if (!tcp_rst (th0))
332  {
333  tcp_make_ack (tc0, b0);
334  *next0 = tcp_next_output (tc0->c_is_ip4);
335  TCP_EVT_DBG (TCP_EVT_DUPACK_SENT, tc0);
336  }
337  return -1;
338  }
339  }
340 
341  /* 2nd: check the RST bit */
342  if (tcp_rst (th0))
343  {
344  tcp_connection_reset (tc0);
345  return -1;
346  }
347 
348  /* 3rd: check security and precedence (skip) */
349 
350  /* 4th: check the SYN bit */
351  if (tcp_syn (th0))
352  {
353  /* TODO implement RFC 5961 */
354  if (tc0->state == TCP_STATE_SYN_RCVD)
355  {
356  tcp_make_synack (tc0, b0);
357  TCP_EVT_DBG (TCP_EVT_SYN_RCVD, tc0, 0);
358  }
359  else
360  {
361  tcp_make_ack (tc0, b0);
362  TCP_EVT_DBG (TCP_EVT_SYNACK_RCVD, tc0);
363  }
364  *next0 = tcp_next_output (tc0->c_is_ip4);
365  return -1;
366  }
367 
368  /* If segment in window, save timestamp */
369  tcp_update_timestamp (tc0, vnet_buffer (b0)->tcp.seq_number,
370  vnet_buffer (b0)->tcp.seq_end);
371  return 0;
372 }
373 
374 always_inline int
/* NOTE(review): signature line elided; takes (tc0, tb0) where tb0 is
 * the vlib buffer carrying the parsed ack_number -- confirm against
 * the original source. Implements the RFC 793 acceptable-ACK test. */
376 {
377  /* SND.UNA =< SEG.ACK =< SND.NXT */
378  return (seq_leq (tc0->snd_una, vnet_buffer (tb0)->tcp.ack_number)
379  && seq_leq (vnet_buffer (tb0)->tcp.ack_number, tc0->snd_nxt));
380 }
381 
382 /**
383  * Compute smoothed RTT as per VJ's '88 SIGCOMM and RFC6298
384  *
385  * Note that although the original article, srtt and rttvar are scaled
386  * to minimize round-off errors, here we don't. Instead, we rely on
387  * better precision time measurements.
388  *
389  * TODO support us rtt resolution
390  */
391 static void
/* NOTE(review): signature line elided; this is
 * tcp_estimate_rtt (tc, mrtt) per the caller in tcp_update_rtt.
 * Folds one RTT measurement into srtt/rttvar with the RFC 6298
 * gains: srtt += err/8, rttvar += (|err| - rttvar)/4, both clamped
 * to a minimum of 1 tick. */
393 {
394  int err, diff;
395 
396  if (tc->srtt != 0)
397  {
398  err = mrtt - tc->srtt;
399 
400  /* XXX Drop in RTT results in RTTVAR increase and bigger RTO.
401  * The increase should be bound */
402  tc->srtt = clib_max ((int) tc->srtt + (err >> 3), 1);
403  diff = (clib_abs (err) - (int) tc->rttvar) >> 2;
404  tc->rttvar = clib_max ((int) tc->rttvar + diff, 1);
405  }
406  else
407  {
408  /* First measurement. */
409  tc->srtt = mrtt;
/* RFC 6298: initial RTTVAR is R/2. */
410  tc->rttvar = mrtt >> 1;
411  }
412 }
413 
414 void
/* NOTE(review): signature line elided; recomputes the retransmission
 * timeout as srtt + 4*rttvar (RFC 6298), clamped to
 * [TCP_RTO_MIN, TCP_RTO_MAX]. */
416 {
417  tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX);
418  tc->rto = clib_max (tc->rto, TCP_RTO_MIN);
419 }
420 
421 /**
422  * Update RTT estimate and RTO timer
423  *
424  * Measure RTT: We have two sources of RTT measurements: TSOPT and ACK
425  * timing. Middle boxes are known to fiddle with TCP options so we
426  * should give higher priority to ACK timing.
427  *
428  * This should be called only if previously sent bytes have been acked.
429  *
430  * return 1 if valid rtt 0 otherwise
431  */
432 static int
/* NOTE(review): signature line elided; this is
 * tcp_update_rtt (tc, ack) per the caller in tcp_dequeue_acked.
 * NOTE(review): despite the doc header above ("return 1 if valid
 * rtt 0 otherwise"), every path returns 0 -- confirm whether the
 * comment or the code is stale. */
434 {
435  u32 mrtt = 0;
436 
437  /* Karn's rule, part 1. Don't use retransmitted segments to estimate
438  * RTT because they're ambiguous. */
439  if (tcp_in_cong_recovery (tc) || tc->sack_sb.sacked_bytes)
440  goto done;
441 
/* Preferred source: ACK timing, valid once the timed sequence is covered. */
442  if (tc->rtt_ts && seq_geq (ack, tc->rtt_seq))
443  {
444  mrtt = tcp_time_now () - tc->rtt_ts;
445  }
446  /* As per RFC7323 TSecr can be used for RTTM only if the segment advances
447  * snd_una, i.e., the left side of the send window:
448  * seq_lt (tc->snd_una, ack). This is a condition for calling update_rtt */
449  else if (tcp_opts_tstamp (&tc->rcv_opts) && tc->rcv_opts.tsecr)
450  {
451  mrtt = tcp_time_now () - tc->rcv_opts.tsecr;
452  }
453 
454  /* Ignore dubious measurements */
455  if (mrtt == 0 || mrtt > TCP_RTT_MAX)
456  goto done;
457 
458  tcp_estimate_rtt (tc, mrtt);
459 
460 done:
461 
462  /* Allow measuring of a new RTT */
463  tc->rtt_ts = 0;
464 
465  /* If we got here something must've been ACKed so make sure boff is 0,
466  * even if mrrt is not valid since we update the rto lower */
467  tc->rto_boff = 0;
468  tcp_update_rto (tc);
469 
470  return 0;
471 }
472 
473 /**
474  * Dequeue bytes that have been acked and while at it update RTT estimates.
475  */
476 static void
478 {
479  /* Dequeue the newly ACKed add SACKed bytes */
480  stream_session_dequeue_drop (&tc->connection,
481  tc->bytes_acked + tc->sack_sb.snd_una_adv);
482 
483  tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una);
484 
485  /* Update rtt and rto */
486  tcp_update_rtt (tc, ack);
487 
488  /* If everything has been acked, stop retransmit timer
489  * otherwise update. */
491 }
492 
493 /**
494  * Check if duplicate ack as per RFC5681 Sec. 2
495  */
496 static u8
/* NOTE(review): the first signature line is elided; per the caller in
 * tcp_ack_is_cc_event this is
 * tcp_ack_is_dupack (tc, b, prev_snd_wnd, prev_snd_una).
 * RFC 5681 duplicate-ACK test: same ack number as before, data still
 * outstanding, segment carries no payload, and window unchanged. */
498  u32 prev_snd_una)
499 {
500  return ((vnet_buffer (b)->tcp.ack_number == prev_snd_una)
501  && seq_gt (tc->snd_una_max, tc->snd_una)
502  && (vnet_buffer (b)->tcp.seq_end == vnet_buffer (b)->tcp.seq_number)
503  && (prev_snd_wnd == tc->snd_wnd));
504 }
505 
506 /**
507  * Checks if ack is a congestion control event.
508  */
509 static u8
/* NOTE(review): first signature line elided; per the caller in the
 * ACK-processing code this takes (tc, b, prev_snd_wnd, prev_snd_una,
 * is_dack). Sets *is_dack and returns non-zero when the ACK should be
 * fed to the congestion-control event handler; a lost-FIN situation
 * is explicitly excluded. */
511  u32 prev_snd_wnd, u32 prev_snd_una, u8 * is_dack)
512 {
513  /* Check if ack is duplicate. Per RFC 6675, ACKs that SACK new data are
514  * defined to be 'duplicate' */
515  *is_dack = tc->sack_sb.last_sacked_bytes
516  || tcp_ack_is_dupack (tc, b, prev_snd_wnd, prev_snd_una);
517 
518  return ((*is_dack || tcp_in_cong_recovery (tc)) && !tcp_is_lost_fin (tc));
519 }
520 
521 void
/* NOTE(review): signature line elided; this is
 * scoreboard_remove_hole (sb, hole) per the caller in the SACK walk.
 * Unlinks the hole from the scoreboard's doubly-linked list (indices
 * into the sb->holes pool, TCP_INVALID_SACK_HOLE_INDEX as list
 * terminator), fixing head/tail as needed, then returns it to the
 * pool. */
523 {
524  sack_scoreboard_hole_t *next, *prev;
525 
526  if (hole->next != TCP_INVALID_SACK_HOLE_INDEX)
527  {
528  next = pool_elt_at_index (sb->holes, hole->next);
529  next->prev = hole->prev;
530  }
531  else
532  {
/* Removing the tail: predecessor becomes the new tail. */
533  sb->tail = hole->prev;
534  }
535 
536  if (hole->prev != TCP_INVALID_SACK_HOLE_INDEX)
537  {
538  prev = pool_elt_at_index (sb->holes, hole->prev);
539  prev->next = hole->next;
540  }
541  else
542  {
/* Removing the head: successor becomes the new head. */
543  sb->head = hole->next;
544  }
545 
/* Don't leave the current-retransmit cursor pointing at a freed hole. */
546  if (scoreboard_hole_index (sb, hole) == sb->cur_rxt_hole)
547  sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
548 
549  /* Poison the entry */
550  if (CLIB_DEBUG > 0)
551  memset (hole, 0xfe, sizeof (*hole));
552 
553  pool_put (sb->holes, hole);
554 }
555 
558  u32 start, u32 end)
559 {
560  sack_scoreboard_hole_t *hole, *next, *prev;
561  u32 hole_index;
562 
563  pool_get (sb->holes, hole);
564  memset (hole, 0, sizeof (*hole));
565 
566  hole->start = start;
567  hole->end = end;
568  hole_index = scoreboard_hole_index (sb, hole);
569 
570  prev = scoreboard_get_hole (sb, prev_index);
571  if (prev)
572  {
573  hole->prev = prev_index;
574  hole->next = prev->next;
575 
576  if ((next = scoreboard_next_hole (sb, hole)))
577  next->prev = hole_index;
578  else
579  sb->tail = hole_index;
580 
581  prev->next = hole_index;
582  }
583  else
584  {
585  sb->head = hole_index;
586  hole->prev = TCP_INVALID_SACK_HOLE_INDEX;
587  hole->next = TCP_INVALID_SACK_HOLE_INDEX;
588  }
589 
590  return hole;
591 }
592 
593 void
595 {
596  sack_scoreboard_hole_t *hole, *prev;
597  u32 bytes = 0, blks = 0;
598 
599  sb->lost_bytes = 0;
600  sb->sacked_bytes = 0;
601  hole = scoreboard_last_hole (sb);
602  if (!hole)
603  return;
604 
605  if (seq_gt (sb->high_sacked, hole->end))
606  {
607  bytes = sb->high_sacked - hole->end;
608  blks = 1;
609  }
610 
611  while ((prev = scoreboard_prev_hole (sb, hole))
612  && (bytes < (TCP_DUPACK_THRESHOLD - 1) * tc->snd_mss
613  && blks < TCP_DUPACK_THRESHOLD))
614  {
615  bytes += hole->start - prev->end;
616  blks++;
617  hole = prev;
618  }
619 
620  while (hole)
621  {
622  sb->lost_bytes += scoreboard_hole_bytes (hole);
623  hole->is_lost = 1;
624  prev = hole;
625  hole = scoreboard_prev_hole (sb, hole);
626  if (hole)
627  bytes += prev->start - hole->end;
628  }
629  sb->sacked_bytes = bytes;
630 }
631 
632 /**
633  * Figure out the next hole to retransmit
634  *
635  * Follows logic proposed in RFC6675 Sec. 4, NextSeg()
636  */
639  sack_scoreboard_hole_t * start,
640  u8 have_sent_1_smss,
641  u8 * can_rescue, u8 * snd_limited)
642 {
643  sack_scoreboard_hole_t *hole = 0;
644 
645  hole = start ? start : scoreboard_first_hole (sb);
646  while (hole && seq_leq (hole->end, sb->high_rxt) && hole->is_lost)
647  hole = scoreboard_next_hole (sb, hole);
648 
649  /* Nothing, return */
650  if (!hole)
651  {
652  sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
653  return 0;
654  }
655 
656  /* Rule (1): if higher than rxt, less than high_sacked and lost */
657  if (hole->is_lost && seq_lt (hole->start, sb->high_sacked))
658  {
659  sb->cur_rxt_hole = scoreboard_hole_index (sb, hole);
660  }
661  else
662  {
663  /* Rule (2): output takes care of transmitting new data */
664  if (!have_sent_1_smss)
665  {
666  hole = 0;
667  sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
668  }
669  /* Rule (3): if hole not lost */
670  else if (seq_lt (hole->start, sb->high_sacked))
671  {
672  *snd_limited = 1;
673  sb->cur_rxt_hole = scoreboard_hole_index (sb, hole);
674  }
675  /* Rule (4): if hole beyond high_sacked */
676  else
677  {
678  ASSERT (seq_geq (hole->start, sb->high_sacked));
679  *snd_limited = 1;
680  *can_rescue = 1;
681  /* HighRxt MUST NOT be updated */
682  return 0;
683  }
684  }
685 
686  if (hole && seq_lt (sb->high_rxt, hole->start))
687  sb->high_rxt = hole->start;
688 
689  return hole;
690 }
691 
692 void
694 {
696  hole = scoreboard_first_hole (sb);
697  if (hole)
698  {
699  seq = seq_gt (seq, hole->start) ? seq : hole->start;
700  sb->cur_rxt_hole = sb->head;
701  }
702  sb->high_rxt = seq;
703 }
704 
705 /**
706  * Test that scoreboard is sane after recovery
707  *
708  * Returns 1 if scoreboard is empty or if first hole beyond
709  * snd_una.
710  */
711 u8
713 {
715  hole = scoreboard_first_hole (&tc->sack_sb);
716  return (!hole || seq_geq (hole->start, tc->snd_una));
717 }
718 
719 void
721 {
722  sack_scoreboard_t *sb = &tc->sack_sb;
723  sack_block_t *blk, tmp;
724  sack_scoreboard_hole_t *hole, *next_hole, *last_hole;
725  u32 blk_index = 0, old_sacked_bytes, hole_index;
726  int i, j;
727 
728  sb->last_sacked_bytes = 0;
729  sb->snd_una_adv = 0;
730  old_sacked_bytes = sb->sacked_bytes;
731  sb->last_bytes_delivered = 0;
732 
733  if (!tcp_opts_sack (&tc->rcv_opts)
734  && sb->head == TCP_INVALID_SACK_HOLE_INDEX)
735  return;
736 
737  /* Remove invalid blocks */
738  blk = tc->rcv_opts.sacks;
739  while (blk < vec_end (tc->rcv_opts.sacks))
740  {
741  if (seq_lt (blk->start, blk->end)
742  && seq_gt (blk->start, tc->snd_una)
743  && seq_gt (blk->start, ack) && seq_leq (blk->end, tc->snd_una_max))
744  {
745  blk++;
746  continue;
747  }
748  vec_del1 (tc->rcv_opts.sacks, blk - tc->rcv_opts.sacks);
749  }
750 
751  /* Add block for cumulative ack */
752  if (seq_gt (ack, tc->snd_una))
753  {
754  tmp.start = tc->snd_una;
755  tmp.end = ack;
756  vec_add1 (tc->rcv_opts.sacks, tmp);
757  }
758 
759  if (vec_len (tc->rcv_opts.sacks) == 0)
760  return;
761 
762  tcp_scoreboard_trace_add (tc, ack);
763 
764  /* Make sure blocks are ordered */
765  for (i = 0; i < vec_len (tc->rcv_opts.sacks); i++)
766  for (j = i + 1; j < vec_len (tc->rcv_opts.sacks); j++)
767  if (seq_lt (tc->rcv_opts.sacks[j].start, tc->rcv_opts.sacks[i].start))
768  {
769  tmp = tc->rcv_opts.sacks[i];
770  tc->rcv_opts.sacks[i] = tc->rcv_opts.sacks[j];
771  tc->rcv_opts.sacks[j] = tmp;
772  }
773 
774  if (sb->head == TCP_INVALID_SACK_HOLE_INDEX)
775  {
776  /* If no holes, insert the first that covers all outstanding bytes */
778  tc->snd_una, tc->snd_una_max);
779  sb->tail = scoreboard_hole_index (sb, last_hole);
780  tmp = tc->rcv_opts.sacks[vec_len (tc->rcv_opts.sacks) - 1];
781  sb->high_sacked = tmp.end;
782  }
783  else
784  {
785  /* If we have holes but snd_una_max is beyond the last hole, update
786  * last hole end */
787  tmp = tc->rcv_opts.sacks[vec_len (tc->rcv_opts.sacks) - 1];
788  last_hole = scoreboard_last_hole (sb);
789  if (seq_gt (tc->snd_una_max, last_hole->end))
790  {
791  if (seq_geq (last_hole->start, sb->high_sacked))
792  {
793  last_hole->end = tc->snd_una_max;
794  }
795  /* New hole after high sacked block */
796  else if (seq_lt (sb->high_sacked, tc->snd_una_max))
797  {
798  scoreboard_insert_hole (sb, sb->tail, sb->high_sacked,
799  tc->snd_una_max);
800  }
801  }
802  /* Keep track of max byte sacked for when the last hole
803  * is acked */
804  if (seq_gt (tmp.end, sb->high_sacked))
805  sb->high_sacked = tmp.end;
806  }
807 
808  /* Walk the holes with the SACK blocks */
809  hole = pool_elt_at_index (sb->holes, sb->head);
810  while (hole && blk_index < vec_len (tc->rcv_opts.sacks))
811  {
812  blk = &tc->rcv_opts.sacks[blk_index];
813  if (seq_leq (blk->start, hole->start))
814  {
815  /* Block covers hole. Remove hole */
816  if (seq_geq (blk->end, hole->end))
817  {
818  next_hole = scoreboard_next_hole (sb, hole);
819 
820  /* Byte accounting: snd_una needs to be advanced */
821  if (blk->end == ack)
822  {
823  if (next_hole)
824  {
825  if (seq_lt (ack, next_hole->start))
826  sb->snd_una_adv = next_hole->start - ack;
827  sb->last_bytes_delivered +=
828  next_hole->start - hole->end;
829  }
830  else
831  {
832  ASSERT (seq_geq (sb->high_sacked, ack));
833  sb->snd_una_adv = sb->high_sacked - ack;
834  sb->last_bytes_delivered += sb->high_sacked - hole->end;
835  }
836  }
837 
838  scoreboard_remove_hole (sb, hole);
839  hole = next_hole;
840  }
841  /* Partial 'head' overlap */
842  else
843  {
844  if (seq_gt (blk->end, hole->start))
845  {
846  hole->start = blk->end;
847  }
848  blk_index++;
849  }
850  }
851  else
852  {
853  /* Hole must be split */
854  if (seq_lt (blk->end, hole->end))
855  {
856  hole_index = scoreboard_hole_index (sb, hole);
857  next_hole = scoreboard_insert_hole (sb, hole_index, blk->end,
858  hole->end);
859 
860  /* Pool might've moved */
861  hole = scoreboard_get_hole (sb, hole_index);
862  hole->end = blk->start;
863  blk_index++;
864  ASSERT (hole->next == scoreboard_hole_index (sb, next_hole));
865  }
866  else if (seq_lt (blk->start, hole->end))
867  {
868  hole->end = blk->start;
869  }
870  hole = scoreboard_next_hole (sb, hole);
871  }
872  }
873 
874  scoreboard_update_bytes (tc, sb);
875  sb->last_sacked_bytes = sb->sacked_bytes
876  - (old_sacked_bytes - sb->last_bytes_delivered);
877  ASSERT (sb->last_sacked_bytes <= sb->sacked_bytes);
878  ASSERT (sb->sacked_bytes == 0
879  || sb->sacked_bytes < tc->snd_una_max - seq_max (tc->snd_una, ack));
880  ASSERT (sb->last_sacked_bytes + sb->lost_bytes <= tc->snd_una_max
881  - seq_max (tc->snd_una, ack));
883  || sb->holes[sb->head].start == ack + sb->snd_una_adv);
884 }
885 
886 /**
887  * Try to update snd_wnd based on feedback received from peer.
888  *
889  * If successful, and new window is 'effectively' 0, activate persist
890  * timer.
891  */
892 static void
893 tcp_update_snd_wnd (tcp_connection_t * tc, u32 seq, u32 ack, u32 snd_wnd)
894 {
895  /* If (SND.WL1 < SEG.SEQ or (SND.WL1 = SEG.SEQ and SND.WL2 =< SEG.ACK)), set
896  * SND.WND <- SEG.WND, set SND.WL1 <- SEG.SEQ, and set SND.WL2 <- SEG.ACK */
897  if (seq_lt (tc->snd_wl1, seq)
898  || (tc->snd_wl1 == seq && seq_leq (tc->snd_wl2, ack)))
899  {
900  tc->snd_wnd = snd_wnd;
901  tc->snd_wl1 = seq;
902  tc->snd_wl2 = ack;
903  TCP_EVT_DBG (TCP_EVT_SND_WND, tc);
904 
905  if (tc->snd_wnd < tc->snd_mss)
906  {
907  /* Set persist timer if not set and we just got 0 wnd */
908  if (!tcp_timer_is_active (tc, TCP_TIMER_PERSIST)
909  && !tcp_timer_is_active (tc, TCP_TIMER_RETRANSMIT))
911  }
912  else
913  {
915  if (!tcp_in_recovery (tc) && tc->rto_boff > 0)
916  {
917  tc->rto_boff = 0;
918  tcp_update_rto (tc);
919  }
920  }
921  }
922 }
923 
924 void
/* NOTE(review): signature line elided; body marks the start of a
 * congestion episode: enter fast recovery, remember the highest byte
 * sent as snd_congestion (recovery exit point), and notify the CC
 * algorithm. */
926 {
927  tcp_fastrecovery_on (tc);
928  tc->snd_congestion = tc->snd_una_max;
929  tc->cc_algo->congestion (tc);
930  TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 4);
931 }
932 
933 static void
/* NOTE(review): signature line elided; body exits RTO-based recovery:
 * resets backoff and RTO, clears the retransmit timestamp, snaps
 * snd_nxt back to the highest byte sent, and clears the recovery
 * flag -- presumably tcp_cc_recovery_exit (tc), confirm against the
 * original source. */
935 {
936  /* Deflate rto */
937  tc->rto_boff = 0;
938  tcp_update_rto (tc);
939  tc->snd_rxt_ts = 0;
940  tc->snd_nxt = tc->snd_una_max;
941  tcp_recovery_off (tc);
942  TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3);
943 }
944 
945 void
947 {
948  tc->cc_algo->recovered (tc);
949  tc->snd_rxt_bytes = 0;
950  tc->rcv_dupacks = 0;
951  tc->snd_nxt = tc->snd_una_max;
954  TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3);
955 }
956 
957 static void
959 {
960  tc->cwnd = tc->prev_cwnd;
961  tc->ssthresh = tc->prev_ssthresh;
962  tc->snd_nxt = tc->snd_una_max;
963  tc->rcv_dupacks = 0;
964  if (tcp_in_recovery (tc))
966  ASSERT (tc->rto_boff == 0);
967  TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 5);
968  /* TODO extend for fastrecovery */
969 }
970 
971 static u8
/* NOTE(review): signature line elided. Detects a spurious RTO
 * retransmit via timestamps (cf. Eifel detection): we are in
 * recovery after exactly one backoff, a retransmit timestamp was
 * recorded, and the echoed TSecr predates that retransmit -- so the
 * ACK must be for the original transmission, not the retransmit. */
973 {
974  return (tcp_in_recovery (tc) && tc->rto_boff == 1
975  && tc->snd_rxt_ts
976  && tcp_opts_tstamp (&tc->rcv_opts)
977  && timestamp_lt (tc->rcv_opts.tsecr, tc->snd_rxt_ts));
978 }
979 
980 int
982 {
985  {
987  return 1;
988  }
989 
990  if (tcp_in_recovery (tc))
992  else if (tcp_in_fastrecovery (tc))
994 
995  ASSERT (tc->rto_boff == 0);
998  return 0;
999 }
1000 
1001 static void
1003 {
1005 
1006  /* Congestion avoidance */
1007  tc->cc_algo->rcv_ack (tc);
1008  tc->tsecr_last_ack = tc->rcv_opts.tsecr;
1009 
1010  /* If a cumulative ack, make sure dupacks is 0 */
1011  tc->rcv_dupacks = 0;
1012 
1013  /* When dupacks hits the threshold we only enter fast retransmit if
1014  * cumulative ack covers more than snd_congestion. Should snd_una
1015  * wrap this test may fail under otherwise valid circumstances.
1016  * Therefore, proactively update snd_congestion when wrap detected. */
1017  if (PREDICT_FALSE
1018  (seq_leq (tc->snd_congestion, tc->snd_una - tc->bytes_acked)
1019  && seq_gt (tc->snd_congestion, tc->snd_una)))
1020  tc->snd_congestion = tc->snd_una - 1;
1021 }
1022 
1023 static u8
/* NOTE(review): signature line elided. SACK-based trigger for fast
 * recovery: more than (dupack-threshold - 1) * SMSS bytes have been
 * SACKed, mirroring the RFC 6675 loss condition. */
1025 {
1026  return (TCP_DUPACK_THRESHOLD - 1) * tc->snd_mss < tc->sack_sb.sacked_bytes;
1027 }
1028 
1029 static u8
/* NOTE(review): signature line elided. Enter fast recovery either on
 * the classic third duplicate ACK or on the SACK-based byte-count
 * trigger above. */
1031 {
1032  return (tc->rcv_dupacks == TCP_DUPACK_THRESHOLD
1033  || tcp_should_fastrecover_sack (tc));
1034 }
1035 
1036 /**
1037  * One function to rule them all ... and in the darkness bind them
1038  */
1039 static void
1041 {
1042  u32 rxt_delivered;
1043 
1044  /*
1045  * Duplicate ACK. Check if we should enter fast recovery, or if already in
1046  * it account for the bytes that left the network.
1047  */
1048  if (is_dack)
1049  {
1050  ASSERT (tc->snd_una != tc->snd_una_max
1051  || tc->sack_sb.last_sacked_bytes);
1052 
1053  tc->rcv_dupacks++;
1054 
1055  if (tc->rcv_dupacks > TCP_DUPACK_THRESHOLD && !tc->bytes_acked)
1056  {
1057  ASSERT (tcp_in_fastrecovery (tc));
1058  /* Pure duplicate ack. If some data got acked, it's handled lower */
1059  tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK);
1060  return;
1061  }
1062  else if (tcp_should_fastrecover (tc))
1063  {
1064  /* Things are already bad */
1065  if (tcp_in_cong_recovery (tc))
1066  {
1067  tc->rcv_dupacks = 0;
1068  goto partial_ack_test;
1069  }
1070 
1071  /* If of of the two conditions lower hold, reset dupacks because
1072  * we're probably after timeout (RFC6582 heuristics).
1073  * If Cumulative ack does not cover more than congestion threshold,
1074  * and:
1075  * 1) The following doesn't hold: The congestion window is greater
1076  * than SMSS bytes and the difference between highest_ack
1077  * and prev_highest_ack is at most 4*SMSS bytes
1078  * 2) Echoed timestamp in the last non-dup ack does not equal the
1079  * stored timestamp
1080  */
1081  if (seq_leq (tc->snd_una, tc->snd_congestion)
1082  && ((!(tc->cwnd > tc->snd_mss
1083  && tc->bytes_acked <= 4 * tc->snd_mss))
1084  || (tc->rcv_opts.tsecr != tc->tsecr_last_ack)))
1085  {
1086  tc->rcv_dupacks = 0;
1087  return;
1088  }
1089 
1091  tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK);
1092 
1093  /* The first segment MUST be retransmitted */
1095 
1096  /* Post retransmit update cwnd to ssthresh and account for the
1097  * three segments that have left the network and should've been
1098  * buffered at the receiver XXX */
1099  tc->cwnd = tc->ssthresh + tc->rcv_dupacks * tc->snd_mss;
1100  ASSERT (tc->cwnd >= tc->snd_mss);
1101 
1102  /* If cwnd allows, send more data */
1103  if (tcp_opts_sack_permitted (&tc->rcv_opts))
1104  {
1105  scoreboard_init_high_rxt (&tc->sack_sb,
1106  tc->snd_una + tc->snd_mss);
1108  }
1109  else
1110  {
1112  }
1113 
1114  return;
1115  }
1116  else if (!tc->bytes_acked
1117  || (tc->bytes_acked && !tcp_in_cong_recovery (tc)))
1118  {
1119  tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK);
1120  return;
1121  }
1122  else
1123  goto partial_ack;
1124  }
1125 
1126 partial_ack_test:
1127 
1128  if (!tc->bytes_acked)
1129  return;
1130 
1131 partial_ack:
1132  /*
1133  * Legitimate ACK. 1) See if we can exit recovery
1134  */
1135  /* XXX limit this only to first partial ack? */
1137 
1138  if (seq_geq (tc->snd_una, tc->snd_congestion))
1139  {
1140  /* If spurious return, we've already updated everything */
1141  if (tcp_cc_recover (tc))
1142  {
1143  tc->tsecr_last_ack = tc->rcv_opts.tsecr;
1144  return;
1145  }
1146 
1147  tc->snd_nxt = tc->snd_una_max;
1148 
1149  /* Treat as congestion avoidance ack */
1150  tc->cc_algo->rcv_ack (tc);
1151  tc->tsecr_last_ack = tc->rcv_opts.tsecr;
1152  return;
1153  }
1154 
1155  /*
1156  * Legitimate ACK. 2) If PARTIAL ACK try to retransmit
1157  */
1158  TCP_EVT_DBG (TCP_EVT_CC_PACK, tc);
1159 
1160  /* RFC6675: If the incoming ACK is a cumulative acknowledgment,
1161  * reset dupacks to 0 */
1162  tc->rcv_dupacks = 0;
1163 
1165 
1166  /* Post RTO timeout don't try anything fancy */
1167  if (tcp_in_recovery (tc))
1168  return;
1169 
1170  /* Remove retransmitted bytes that have been delivered */
1171  ASSERT (tc->bytes_acked + tc->sack_sb.snd_una_adv
1172  >= tc->sack_sb.last_bytes_delivered
1173  || (tc->flags & TCP_CONN_FINSNT));
1174 
1175  if (seq_lt (tc->snd_una, tc->sack_sb.high_rxt))
1176  {
1177  /* If we have sacks and we haven't gotten an ack beyond high_rxt,
1178  * remove sacked bytes delivered */
1179  rxt_delivered = tc->bytes_acked + tc->sack_sb.snd_una_adv
1180  - tc->sack_sb.last_bytes_delivered;
1181  ASSERT (tc->snd_rxt_bytes >= rxt_delivered);
1182  tc->snd_rxt_bytes -= rxt_delivered;
1183  }
1184  else
1185  {
1186  /* Either all retransmitted holes have been acked, or we're
1187  * "in the blind" and retransmitting segment by segment */
1188  tc->snd_rxt_bytes = 0;
1189  }
1190 
1191  tc->cc_algo->rcv_cong_ack (tc, TCP_CC_PARTIALACK);
1192 
1193  /*
1194  * Since this was a partial ack, try to retransmit some more data
1195  */
1196  tcp_fast_retransmit (tc);
1197 }
1198 
1199 void
/* NOTE(review): signature line elided; binds the connection to the
 * NewReno congestion-control algorithm and runs its init hook. */
1201 {
1202  tc->cc_algo = tcp_cc_algo_get (TCP_CC_NEWRENO);
1203  tc->cc_algo->init (tc);
1204 }
1205 
1206 /**
1207  * Process incoming ACK
1208  */
1209 static int
1211  tcp_header_t * th, u32 * next, u32 * error)
1212 {
1213  u32 prev_snd_wnd, prev_snd_una;
1214  u8 is_dack;
1215 
1216  TCP_EVT_DBG (TCP_EVT_CC_STAT, tc);
1217 
1218  /* If the ACK acks something not yet sent (SEG.ACK > SND.NXT) */
1219  if (PREDICT_FALSE (seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_nxt)))
1220  {
1221  /* If we have outstanding data and this is within the window, accept it,
1222  * probably retransmit has timed out. Otherwise ACK segment and then
1223  * drop it */
1224  if (seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_una_max))
1225  {
1226  tcp_make_ack (tc, b);
1227  *next = tcp_next_output (tc->c_is_ip4);
1228  *error = TCP_ERROR_ACK_INVALID;
1229  TCP_EVT_DBG (TCP_EVT_ACK_RCV_ERR, tc, 0,
1230  vnet_buffer (b)->tcp.ack_number);
1231  return -1;
1232  }
1233 
1234  TCP_EVT_DBG (TCP_EVT_ACK_RCV_ERR, tc, 2,
1235  vnet_buffer (b)->tcp.ack_number);
1236 
1237  tc->snd_nxt = vnet_buffer (b)->tcp.ack_number;
1238  *error = TCP_ERROR_ACK_FUTURE;
1239  }
1240 
1241  /* If old ACK, probably it's an old dupack */
1242  if (PREDICT_FALSE (seq_lt (vnet_buffer (b)->tcp.ack_number, tc->snd_una)))
1243  {
1244  *error = TCP_ERROR_ACK_OLD;
1245  TCP_EVT_DBG (TCP_EVT_ACK_RCV_ERR, tc, 1,
1246  vnet_buffer (b)->tcp.ack_number);
1247  if (tcp_in_fastrecovery (tc) && tc->rcv_dupacks == TCP_DUPACK_THRESHOLD)
1248  {
1249  TCP_EVT_DBG (TCP_EVT_DUPACK_RCVD, tc);
1250  tcp_cc_handle_event (tc, 1);
1251  }
1252  /* Don't drop yet */
1253  return 0;
1254  }
1255 
1256  /*
1257  * Looks okay, process feedback
1258  */
1259 
1260  if (tcp_opts_sack_permitted (&tc->rcv_opts))
1261  tcp_rcv_sacks (tc, vnet_buffer (b)->tcp.ack_number);
1262 
1263  prev_snd_wnd = tc->snd_wnd;
1264  prev_snd_una = tc->snd_una;
1265  tcp_update_snd_wnd (tc, vnet_buffer (b)->tcp.seq_number,
1266  vnet_buffer (b)->tcp.ack_number,
1267  clib_net_to_host_u16 (th->window) << tc->snd_wscale);
1268  tc->bytes_acked = vnet_buffer (b)->tcp.ack_number - tc->snd_una;
1269  tc->snd_una = vnet_buffer (b)->tcp.ack_number + tc->sack_sb.snd_una_adv;
1270  tcp_validate_txf_size (tc, tc->bytes_acked);
1271 
1272  if (tc->bytes_acked)
1273  tcp_dequeue_acked (tc, vnet_buffer (b)->tcp.ack_number);
1274 
1275  TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc);
1276 
1277  /*
1278  * Check if we have congestion event
1279  */
1280 
1281  if (tcp_ack_is_cc_event (tc, b, prev_snd_wnd, prev_snd_una, &is_dack))
1282  {
1283  tcp_cc_handle_event (tc, is_dack);
1284  if (!tcp_in_cong_recovery (tc))
1285  return 0;
1286  *error = TCP_ERROR_ACK_DUP;
1287  TCP_EVT_DBG (TCP_EVT_DUPACK_RCVD, tc, 1);
1288  return vnet_buffer (b)->tcp.data_len ? 0 : -1;
1289  }
1290 
1291  /*
1292  * Update congestion control (slow start/congestion avoidance)
1293  */
1294  tcp_cc_update (tc, b);
1295 
1296  return 0;
1297 }
1298 
1299 static u8
1301 {
1302  int i;
1303  for (i = 1; i < vec_len (sacks); i++)
1304  {
1305  if (sacks[i - 1].end == sacks[i].start)
1306  return 0;
1307  }
1308  return 1;
1309 }
1310 
1311 /**
1312  * Build SACK list as per RFC2018.
1313  *
1314  * Makes sure the first block contains the segment that generated the current
1315  * ACK and the following ones are the ones most recently reported in SACK
1316  * blocks.
1317  *
1318  * @param tc TCP connection for which the SACK list is updated
1319  * @param start Start sequence number of the newest SACK block
1320  * @param end End sequence of the newest SACK block
1321  */
1322 void
1324 {
1325  sack_block_t *new_list = 0, *block = 0;
1326  int i;
1327 
1328  /* If the first segment is ooo add it to the list. Last write might've moved
1329  * rcv_nxt over the first segment. */
1330  if (seq_lt (tc->rcv_nxt, start))
1331  {
1332  vec_add2 (new_list, block, 1);
1333  block->start = start;
1334  block->end = end;
1335  }
1336 
1337  /* Find the blocks still worth keeping. */
1338  for (i = 0; i < vec_len (tc->snd_sacks); i++)
1339  {
1340  /* Discard if rcv_nxt advanced beyond current block */
1341  if (seq_leq (tc->snd_sacks[i].start, tc->rcv_nxt))
1342  continue;
1343 
1344  /* Merge or drop if segment overlapped by the new segment */
1345  if (block && (seq_geq (tc->snd_sacks[i].end, new_list[0].start)
1346  && seq_leq (tc->snd_sacks[i].start, new_list[0].end)))
1347  {
1348  if (seq_lt (tc->snd_sacks[i].start, new_list[0].start))
1349  new_list[0].start = tc->snd_sacks[i].start;
1350  if (seq_lt (new_list[0].end, tc->snd_sacks[i].end))
1351  new_list[0].end = tc->snd_sacks[i].end;
1352  continue;
1353  }
1354 
1355  /* Save to new SACK list if we have space. */
1356  if (vec_len (new_list) < TCP_MAX_SACK_BLOCKS)
1357  {
1358  vec_add1 (new_list, tc->snd_sacks[i]);
1359  }
1360  else
1361  {
1362  clib_warning ("sack discarded");
1363  }
1364  }
1365 
1366  ASSERT (vec_len (new_list) <= TCP_MAX_SACK_BLOCKS);
1367 
1368  /* Replace old vector with new one */
1369  vec_free (tc->snd_sacks);
1370  tc->snd_sacks = new_list;
1371 
1372  /* Segments should not 'touch' */
1373  ASSERT (tcp_sack_vector_is_sane (tc->snd_sacks));
1374 }
1375 
1376 /** Enqueue data for delivery to application */
1377 always_inline int
1379  u16 data_len)
1380 {
1381  int written, error = TCP_ERROR_ENQUEUED;
1382 
1383  ASSERT (seq_geq (vnet_buffer (b)->tcp.seq_number, tc->rcv_nxt));
1384 
1385  /* Pure ACK. Update rcv_nxt and be done. */
1386  if (PREDICT_FALSE (data_len == 0))
1387  {
1388  return TCP_ERROR_PURE_ACK;
1389  }
1390 
1391  written = stream_session_enqueue_data (&tc->connection, b, 0,
1392  1 /* queue event */ , 1);
1393 
1394  TCP_EVT_DBG (TCP_EVT_INPUT, tc, 0, data_len, written);
1395 
1396  /* Update rcv_nxt */
1397  if (PREDICT_TRUE (written == data_len))
1398  {
1399  tc->rcv_nxt += written;
1400  }
1401  /* If more data written than expected, account for out-of-order bytes. */
1402  else if (written > data_len)
1403  {
1404  tc->rcv_nxt += written;
1405 
1406  /* Send ACK confirming the update */
1407  tc->flags |= TCP_CONN_SNDACK;
1408  }
1409  else if (written > 0)
1410  {
1411  /* We've written something but FIFO is probably full now */
1412  tc->rcv_nxt += written;
1413 
1414  /* Depending on how fast the app is, all remaining buffers in burst will
1415  * not be enqueued. Inform peer */
1416  tc->flags |= TCP_CONN_SNDACK;
1417 
1418  error = TCP_ERROR_PARTIALLY_ENQUEUED;
1419  }
1420  else
1421  {
1422  tc->flags |= TCP_CONN_SNDACK;
1423  return TCP_ERROR_FIFO_FULL;
1424  }
1425 
1426  /* Update SACK list if need be */
1427  if (tcp_opts_sack_permitted (&tc->rcv_opts))
1428  {
1429  /* Remove SACK blocks that have been delivered */
1430  tcp_update_sack_list (tc, tc->rcv_nxt, tc->rcv_nxt);
1431  }
1432 
1433  return error;
1434 }
1435 
1436 /** Enqueue out-of-order data */
1437 always_inline int
1439  u16 data_len)
1440 {
1441  stream_session_t *s0;
1442  int rv, offset;
1443 
1444  ASSERT (seq_gt (vnet_buffer (b)->tcp.seq_number, tc->rcv_nxt));
1445 
1446  /* Pure ACK. Do nothing */
1447  if (PREDICT_FALSE (data_len == 0))
1448  {
1449  return TCP_ERROR_PURE_ACK;
1450  }
1451 
1452  /* Enqueue out-of-order data with relative offset */
1453  rv = stream_session_enqueue_data (&tc->connection, b,
1454  vnet_buffer (b)->tcp.seq_number -
1455  tc->rcv_nxt, 0 /* queue event */ , 0);
1456 
1457  /* Nothing written */
1458  if (rv)
1459  {
1460  TCP_EVT_DBG (TCP_EVT_INPUT, tc, 1, data_len, 0);
1461  return TCP_ERROR_FIFO_FULL;
1462  }
1463 
1464  TCP_EVT_DBG (TCP_EVT_INPUT, tc, 1, data_len, data_len);
1465 
1466  /* Update SACK list if in use */
1467  if (tcp_opts_sack_permitted (&tc->rcv_opts))
1468  {
1469  ooo_segment_t *newest;
1470  u32 start, end;
1471 
1472  s0 = stream_session_get (tc->c_s_index, tc->c_thread_index);
1473 
1474  /* Get the newest segment from the fifo */
1475  newest = svm_fifo_newest_ooo_segment (s0->server_rx_fifo);
1476  if (newest)
1477  {
1478  offset = ooo_segment_offset (s0->server_rx_fifo, newest);
1479  ASSERT (offset <= vnet_buffer (b)->tcp.seq_number - tc->rcv_nxt);
1480  start = tc->rcv_nxt + offset;
1481  end = start + ooo_segment_length (s0->server_rx_fifo, newest);
1482  tcp_update_sack_list (tc, start, end);
1483  svm_fifo_newest_ooo_segment_reset (s0->server_rx_fifo);
1484  }
1485  }
1486 
1487  return TCP_ERROR_ENQUEUED;
1488 }
1489 
1490 /**
1491  * Check if ACK could be delayed. If ack can be delayed, it should return
1492  * true for a full frame. If we're always acking return 0.
1493  */
1494 always_inline int
1496 {
1497  /* Send ack if ... */
1498  if (TCP_ALWAYS_ACK
1499  /* just sent a rcv wnd 0 */
1500  || (tc->flags & TCP_CONN_SENT_RCV_WND0) != 0
1501  /* constrained to send ack */
1502  || (tc->flags & TCP_CONN_SNDACK) != 0
1503  /* we're almost out of tx wnd */
1504  || tcp_available_snd_space (tc) < 4 * tc->snd_mss)
1505  return 0;
1506 
1507  return 1;
1508 }
1509 
1510 static int
1512 {
1513  u32 discard, first = b->current_length;
1515 
1516  /* Handle multi-buffer segments */
1517  if (n_bytes_to_drop > b->current_length)
1518  {
1519  if (!(b->flags & VLIB_BUFFER_NEXT_PRESENT))
1520  return -1;
1521  do
1522  {
1523  discard = clib_min (n_bytes_to_drop, b->current_length);
1524  vlib_buffer_advance (b, discard);
1525  b = vlib_get_buffer (vm, b->next_buffer);
1526  n_bytes_to_drop -= discard;
1527  }
1528  while (n_bytes_to_drop);
1529  if (n_bytes_to_drop > first)
1530  b->total_length_not_including_first_buffer -= n_bytes_to_drop - first;
1531  }
1532  else
1533  vlib_buffer_advance (b, n_bytes_to_drop);
1534  vnet_buffer (b)->tcp.data_len -= n_bytes_to_drop;
1535  return 0;
1536 }
1537 
1538 static int
1540  u32 * next0)
1541 {
1542  u32 error = 0, n_bytes_to_drop, n_data_bytes;
1543 
1544  vlib_buffer_advance (b, vnet_buffer (b)->tcp.data_offset);
1545  n_data_bytes = vnet_buffer (b)->tcp.data_len;
1546  ASSERT (n_data_bytes);
1547 
1548  /* Handle out-of-order data */
1549  if (PREDICT_FALSE (vnet_buffer (b)->tcp.seq_number != tc->rcv_nxt))
1550  {
1551  /* Old sequence numbers allowed through because they overlapped
1552  * the rx window */
1553  if (seq_lt (vnet_buffer (b)->tcp.seq_number, tc->rcv_nxt))
1554  {
1555  error = TCP_ERROR_SEGMENT_OLD;
1556  *next0 = TCP_NEXT_DROP;
1557 
1558  /* Completely in the past (possible retransmit) */
1559  if (seq_leq (vnet_buffer (b)->tcp.seq_end, tc->rcv_nxt))
1560  {
1561  /* Ack retransmissions since we may not have any data to send */
1562  tcp_make_ack (tc, b);
1563  *next0 = tcp_next_output (tc->c_is_ip4);
1564  goto done;
1565  }
1566 
1567  /* Chop off the bytes in the past */
1568  n_bytes_to_drop = tc->rcv_nxt - vnet_buffer (b)->tcp.seq_number;
1569  n_data_bytes -= n_bytes_to_drop;
1570  vnet_buffer (b)->tcp.seq_number = tc->rcv_nxt;
1571  if (tcp_buffer_discard_bytes (b, n_bytes_to_drop))
1572  goto done;
1573 
1574  goto in_order;
1575  }
1576 
1577  error = tcp_session_enqueue_ooo (tc, b, n_data_bytes);
1578 
1579  /* N.B. Should not filter burst of dupacks. Two issues 1) dupacks open
1580  * cwnd on remote peer when congested 2) acks leaving should have the
1581  * latest rcv_wnd since the burst may eaten up all of it, so only the
1582  * old ones could be filtered.
1583  */
1584 
1585  /* RFC2581: Send DUPACK for fast retransmit */
1586  tcp_make_ack (tc, b);
1587  *next0 = tcp_next_output (tc->c_is_ip4);
1588 
1589  /* Mark as DUPACK. We may filter these in output if
1590  * the burst fills the holes. */
1591  if (n_data_bytes)
1592  vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_DUPACK;
1593 
1594  TCP_EVT_DBG (TCP_EVT_DUPACK_SENT, tc);
1595  goto done;
1596  }
1597 
1598 in_order:
1599 
1600  /* In order data, enqueue. Fifo figures out by itself if any out-of-order
1601  * segments can be enqueued after fifo tail offset changes. */
1602  error = tcp_session_enqueue_data (tc, b, n_data_bytes);
1603 
1604  /* Check if ACK can be delayed */
1605  if (tcp_can_delack (tc))
1606  {
1607  if (!tcp_timer_is_active (tc, TCP_TIMER_DELACK))
1608  tcp_timer_set (tc, TCP_TIMER_DELACK, TCP_DELACK_TIME);
1609  goto done;
1610  }
1611 
1612  *next0 = tcp_next_output (tc->c_is_ip4);
1613  tcp_make_ack (tc, b);
1614 
1615 done:
1616  return error;
1617 }
1618 
1619 typedef struct
1620 {
1623 } tcp_rx_trace_t;
1624 
1625 u8 *
1626 format_tcp_rx_trace (u8 * s, va_list * args)
1627 {
1628  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1629  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1630  tcp_rx_trace_t *t = va_arg (*args, tcp_rx_trace_t *);
1631  uword indent = format_get_indent (s);
1632 
1633  s = format (s, "%U\n%U%U",
1634  format_tcp_header, &t->tcp_header, 128,
1635  format_white_space, indent,
1637 
1638  return s;
1639 }
1640 
1641 u8 *
1642 format_tcp_rx_trace_short (u8 * s, va_list * args)
1643 {
1644  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1645  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1646  tcp_rx_trace_t *t = va_arg (*args, tcp_rx_trace_t *);
1647 
1648  s = format (s, "%d -> %d (%U)",
1649  clib_net_to_host_u16 (t->tcp_header.src_port),
1650  clib_net_to_host_u16 (t->tcp_header.dst_port), format_tcp_state,
1651  t->tcp_connection.state);
1652 
1653  return s;
1654 }
1655 
1656 void
1658  tcp_header_t * th0, vlib_buffer_t * b0, u8 is_ip4)
1659 {
1660  if (tc0)
1661  {
1662  clib_memcpy (&t0->tcp_connection, tc0, sizeof (t0->tcp_connection));
1663  }
1664  else
1665  {
1666  th0 = tcp_buffer_hdr (b0);
1667  }
1668  clib_memcpy (&t0->tcp_header, th0, sizeof (t0->tcp_header));
1669 }
1670 
1671 always_inline void
1673 {
1674  if (PREDICT_TRUE (!val))
1675  return;
1676 
1677  if (is_ip4)
1678  vlib_node_increment_counter (vm, tcp4_established_node.index, evt, val);
1679  else
1680  vlib_node_increment_counter (vm, tcp6_established_node.index, evt, val);
1681 }
1682 
/* Shared rx handler for the established state, parameterized on ip4/ip6.
 * Validates each segment (SEQ/RST/SYN), processes the ACK, enqueues any
 * payload and handles FIN, then forwards the buffer to drop/output.
 * NOTE(review): the two signature lines were lost in extraction; upstream
 * this reads:
 *   always_inline uword
 *   tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
 * -- confirm against the repository. */
 1685  vlib_frame_t * from_frame, int is_ip4)
 1686 {
 1687  u32 n_left_from, next_index, *from, *to_next;
 1688  u32 my_thread_index = vm->thread_index, errors = 0;
 1689  tcp_main_t *tm = vnet_get_tcp_main ();
 1690  u8 is_fin = 0;
 1691 
 1692  from = vlib_frame_vector_args (from_frame);
 1693  n_left_from = from_frame->n_vectors;
 1694 
 1695  next_index = node->cached_next_index;
 1696 
 1697  while (n_left_from > 0)
 1698  {
 1699  u32 n_left_to_next;
 1700 
 1701  vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
 1702  while (n_left_from > 0 && n_left_to_next > 0)
 1703  {
 1704  u32 bi0;
 1705  vlib_buffer_t *b0;
 1706  tcp_header_t *th0 = 0;
 1707  tcp_connection_t *tc0;
 1708  u32 next0 = TCP_ESTABLISHED_NEXT_DROP, error0 = TCP_ERROR_ENQUEUED;
 1709 
 1710  bi0 = from[0];
 1711  to_next[0] = bi0;
 1712  from += 1;
 1713  to_next += 1;
 1714  n_left_from -= 1;
 1715  n_left_to_next -= 1;
 1716 
 1717  b0 = vlib_get_buffer (vm, bi0);
 1718  tc0 = tcp_connection_get (vnet_buffer (b0)->tcp.connection_index,
 1719  my_thread_index);
 1720 
 1721  if (PREDICT_FALSE (tc0 == 0))
 1722  {
 1723  error0 = TCP_ERROR_INVALID_CONNECTION;
 1724  goto done;
 1725  }
 1726 
 1727  th0 = tcp_buffer_hdr (b0);
 1728  /* N.B. buffer is rewritten if segment is ooo. Thus, th0 becomes a
 1729  * dangling reference. */
 1730  is_fin = tcp_is_fin (th0);
 1731 
 1732  /* SYNs, FINs and data consume sequence numbers */
 1733  vnet_buffer (b0)->tcp.seq_end = vnet_buffer (b0)->tcp.seq_number
 1734  + tcp_is_syn (th0) + is_fin + vnet_buffer (b0)->tcp.data_len;
 1735 
 1736  /* TODO header prediction fast path */
 1737 
 1738  /* 1-4: check SEQ, RST, SYN */
 1739  if (PREDICT_FALSE (tcp_segment_validate (vm, tc0, b0, th0, &next0)))
 1740  {
 1741  error0 = TCP_ERROR_SEGMENT_INVALID;
 1742  TCP_EVT_DBG (TCP_EVT_SEG_INVALID, tc0,
 1743  vnet_buffer (b0)->tcp.seq_number,
 1744  vnet_buffer (b0)->tcp.seq_end);
 1745  goto done;
 1746  }
 1747 
 1748  /* 5: check the ACK field */
 1749  if (tcp_rcv_ack (tc0, b0, th0, &next0, &error0))
 1750  goto done;
 1751 
 1752  /* 6: check the URG bit TODO */
 1753 
 1754  /* 7: process the segment text */
 1755  if (vnet_buffer (b0)->tcp.data_len)
 1756  error0 = tcp_segment_rcv (tm, tc0, b0, &next0);
 1757 
 1758  /* 8: check the FIN bit */
 1759  if (PREDICT_FALSE (is_fin))
 1760  {
 1761  /* Enter CLOSE-WAIT and notify session. To avoid lingering
 1762  * in CLOSE-WAIT, set timer (reuse WAITCLOSE). */
 1763  /* Account for the FIN if nothing else was received */
 1764  if (vnet_buffer (b0)->tcp.data_len == 0)
 1765  tc0->rcv_nxt += 1;
 1766  tcp_make_ack (tc0, b0);
 1767  next0 = tcp_next_output (tc0->c_is_ip4);
 1768  tc0->state = TCP_STATE_CLOSE_WAIT;
 1769  stream_session_disconnect_notify (&tc0->connection);
 1770  tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME);
 1771  TCP_EVT_DBG (TCP_EVT_FIN_RCVD, tc0);
 1772  }
 1773 
 1774  done:
 1775  b0->error = node->errors[error0];
/* NOTE(review): the traced-buffer condition line, upstream
 * if (PREDICT_FALSE ((b0->flags & VLIB_BUFFER_IS_TRACED))), appears to have
 * been lost in extraction before the brace below -- confirm. */
 1777  {
 1778  tcp_rx_trace_t *t0 =
 1779  vlib_add_trace (vm, node, b0, sizeof (*t0));
 1780  tcp_set_rx_trace_data (t0, tc0, th0, b0, is_ip4);
 1781  }
 1782 
 1783  vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
 1784  n_left_to_next, bi0, next0);
 1785  }
 1786 
 1787  vlib_put_next_frame (vm, node, next_index, n_left_to_next);
 1788  }
 1789 
 1790  errors = session_manager_flush_enqueue_events (my_thread_index);
 1791  tcp_established_inc_counter (vm, is_ip4, TCP_ERROR_EVENT_FIFO_FULL, errors);
 1792  tcp_flush_frame_to_output (vm, my_thread_index, is_ip4);
 1793 
 1794  return from_frame->n_vectors;
 1795 }
1796 
1797 static uword
1799  vlib_frame_t * from_frame)
1800 {
1801  return tcp46_established_inline (vm, node, from_frame, 1 /* is_ip4 */ );
1802 }
1803 
1804 static uword
1806  vlib_frame_t * from_frame)
1807 {
1808  return tcp46_established_inline (vm, node, from_frame, 0 /* is_ip4 */ );
1809 }
1810 
 1811 /* *INDENT-OFF* */
/* Graph node registration for the ip4 established-state rx node.
 * NOTE(review): the registration macro line, upstream
 * VLIB_REGISTER_NODE (tcp4_established_node) =, was lost in extraction
 * before the brace below -- confirm. */
 1813 {
 1814  .function = tcp4_established,
 1815  .name = "tcp4-established",
 1816  /* Takes a vector of packets. */
 1817  .vector_size = sizeof (u32),
 1818  .n_errors = TCP_N_ERROR,
 1819  .error_strings = tcp_error_strings,
 1820  .n_next_nodes = TCP_ESTABLISHED_N_NEXT,
 1821  .next_nodes =
 1822  {
 1823 #define _(s,n) [TCP_ESTABLISHED_NEXT_##s] = n,
/* NOTE(review): the foreach_tcp_state_next invocation line was lost in
 * extraction here -- confirm. */
 1825 #undef _
 1826  },
 1827  .format_trace = format_tcp_rx_trace_short,
 1828 };
 1829 /* *INDENT-ON* */
1830 
1832 
 1833 /* *INDENT-OFF* */
/* Graph node registration for the ip6 established-state rx node.
 * NOTE(review): the registration macro line, upstream
 * VLIB_REGISTER_NODE (tcp6_established_node) =, was lost in extraction
 * before the brace below -- confirm. */
 1835 {
 1836  .function = tcp6_established,
 1837  .name = "tcp6-established",
 1838  /* Takes a vector of packets. */
 1839  .vector_size = sizeof (u32),
 1840  .n_errors = TCP_N_ERROR,
 1841  .error_strings = tcp_error_strings,
 1842  .n_next_nodes = TCP_ESTABLISHED_N_NEXT,
 1843  .next_nodes =
 1844  {
 1845 #define _(s,n) [TCP_ESTABLISHED_NEXT_##s] = n,
/* NOTE(review): the foreach_tcp_state_next invocation line was lost in
 * extraction here -- confirm. */
 1847 #undef _
 1848  },
 1849  .format_trace = format_tcp_rx_trace_short,
 1850 };
 1851 /* *INDENT-ON* */
1852 
1853 
1855 
1858 
1859 static u8
1861 {
1863  if (!tc)
1864  return 1;
1865 
1866  u8 is_valid = (tc->c_lcl_port == hdr->dst_port
1867  && (tc->state == TCP_STATE_LISTEN
1868  || tc->c_rmt_port == hdr->src_port));
1869 
1870  if (!is_valid)
1871  {
1872  if ((tmp =
1873  stream_session_half_open_lookup (&tc->c_lcl_ip, &tc->c_rmt_ip,
1874  tc->c_lcl_port, tc->c_rmt_port,
1875  tc->c_transport_proto)))
1876  {
1877  if (tmp->lcl_port == hdr->dst_port
1878  && tmp->rmt_port == hdr->src_port)
1879  {
1880  clib_warning ("half-open is valid!");
1881  }
1882  }
1883  }
1884  return is_valid;
1885 }
1886 
1887 /**
1888  * Lookup transport connection
1889  */
 1890 static tcp_connection_t *
 1891 tcp_lookup_connection (vlib_buffer_t * b, u8 thread_index, u8 is_ip4)
 1892 {
 1893  tcp_header_t *tcp;
 1894  transport_connection_t *tconn;
 1895  tcp_connection_t *tc;
 1896  if (is_ip4)
 1897  {
 1898  ip4_header_t *ip4;
 1899  ip4 = vlib_buffer_get_current (b);
 1900  tcp = ip4_next_header (ip4);
/* NOTE(review): the line assigning tconn -- a session-layer transport
 * lookup taking &ip4->dst_address as its first argument -- was lost in
 * extraction here; confirm the exact callee against the repository. */
 1902  &ip4->src_address,
 1903  tcp->dst_port,
 1904  tcp->src_port,
 1905  SESSION_TYPE_IP4_TCP,
 1906  thread_index);
 1907  tc = tcp_get_connection_from_transport (tconn);
 1908  ASSERT (tcp_lookup_is_valid (tc, tcp));
 1909  }
 1910  else
 1911  {
 1912  ip6_header_t *ip6;
 1913  ip6 = vlib_buffer_get_current (b);
 1914  tcp = ip6_next_header (ip6);
/* NOTE(review): the matching ip6 lookup line (first argument
 * &ip6->dst_address) was lost in extraction here as well -- confirm. */
 1916  &ip6->src_address,
 1917  tcp->dst_port,
 1918  tcp->src_port,
 1919  SESSION_TYPE_IP6_TCP,
 1920  thread_index);
 1921  tc = tcp_get_connection_from_transport (tconn);
 1922  ASSERT (tcp_lookup_is_valid (tc, tcp));
 1923  }
 1924  return tc;
 1925 }
1926 
/* Shared rx handler for the SYN-SENT state, parameterized on ip4/ip6.
 * Follows RFC793 p. 66: check ACK, RST, SYN, then move the half-open
 * connection to the current thread's pool as ESTABLISHED (syn-ack) or
 * SYN-RCVD (simultaneous open).
 * NOTE(review): the two signature lines were lost in extraction; upstream
 * this reads:
 *   always_inline uword
 *   tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
 * -- confirm against the repository. */
 1929  vlib_frame_t * from_frame, int is_ip4)
 1930 {
 1931  tcp_main_t *tm = vnet_get_tcp_main ();
 1932  u32 n_left_from, next_index, *from, *to_next;
 1933  u32 my_thread_index = vm->thread_index, errors = 0;
 1934 
 1935  from = vlib_frame_vector_args (from_frame);
 1936  n_left_from = from_frame->n_vectors;
 1937 
 1938  next_index = node->cached_next_index;
 1939 
 1940  while (n_left_from > 0)
 1941  {
 1942  u32 n_left_to_next;
 1943 
 1944  vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
 1945 
 1946  while (n_left_from > 0 && n_left_to_next > 0)
 1947  {
 1948  u32 bi0, ack0, seq0;
 1949  vlib_buffer_t *b0;
 1950  tcp_rx_trace_t *t0;
 1951  tcp_header_t *tcp0 = 0;
 1952  tcp_connection_t *tc0;
 1953  tcp_connection_t *new_tc0;
 1954  u32 next0 = TCP_SYN_SENT_NEXT_DROP, error0 = TCP_ERROR_ENQUEUED;
 1955 
 1956  bi0 = from[0];
 1957  to_next[0] = bi0;
 1958  from += 1;
 1959  to_next += 1;
 1960  n_left_from -= 1;
 1961  n_left_to_next -= 1;
 1962 
 1963  b0 = vlib_get_buffer (vm, bi0);
 1964  tc0 =
/* NOTE(review): the call line fetching the half-open connection, upstream
 * tcp_half_open_connection_get (vnet_buffer (b0)->, was lost in
 * extraction here -- confirm. */
 1966  tcp.connection_index);
 1967  if (PREDICT_FALSE (tc0 == 0))
 1968  {
 1969  error0 = TCP_ERROR_INVALID_CONNECTION;
 1970  goto drop;
 1971  }
 1972 
 1973  /* Half-open completed recently but the connection wasn't removed
 1974  * yet by the owning thread */
 1975  if (PREDICT_FALSE (tc0->flags & TCP_CONN_HALF_OPEN_DONE))
 1976  {
 1977  /* Make sure the connection actually exists */
 1978  ASSERT (tcp_lookup_connection (b0, my_thread_index, is_ip4));
 1979  goto drop;
 1980  }
 1981 
 1982  ack0 = vnet_buffer (b0)->tcp.ack_number;
 1983  seq0 = vnet_buffer (b0)->tcp.seq_number;
 1984  tcp0 = tcp_buffer_hdr (b0);
 1985 
 1986  /* Crude check to see if the connection handle does not match
 1987  * the packet. Probably connection just switched to established */
 1988  if (PREDICT_FALSE (tcp0->dst_port != tc0->c_lcl_port
 1989  || tcp0->src_port != tc0->c_rmt_port))
 1990  goto drop;
 1991 
 1992  if (PREDICT_FALSE
 1993  (!tcp_ack (tcp0) && !tcp_rst (tcp0) && !tcp_syn (tcp0)))
 1994  goto drop;
 1995 
 1996  /* SYNs, FINs and data consume sequence numbers */
 1997  vnet_buffer (b0)->tcp.seq_end = seq0 + tcp_is_syn (tcp0)
 1998  + tcp_is_fin (tcp0) + vnet_buffer (b0)->tcp.data_len;
 1999 
 2000  /*
 2001  * 1. check the ACK bit
 2002  */
 2003 
 2004  /*
 2005  * If the ACK bit is set
 2006  * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send a reset (unless
 2007  * the RST bit is set, if so drop the segment and return)
 2008  * <SEQ=SEG.ACK><CTL=RST>
 2009  * and discard the segment. Return.
 2010  * If SND.UNA =< SEG.ACK =< SND.NXT then the ACK is acceptable.
 2011  */
 2012  if (tcp_ack (tcp0))
 2013  {
 2014  if (seq_leq (ack0, tc0->iss) || seq_gt (ack0, tc0->snd_nxt))
 2015  {
 2016  clib_warning ("ack not in rcv wnd");
 2017  if (!tcp_rst (tcp0))
 2018  tcp_send_reset_w_pkt (tc0, b0, is_ip4);
 2019  goto drop;
 2020  }
 2021 
 2022  /* Make sure ACK is valid */
 2023  if (seq_gt (tc0->snd_una, ack0))
 2024  {
 2025  clib_warning ("ack invalid");
 2026  goto drop;
 2027  }
 2028  }
 2029 
 2030  /*
 2031  * 2. check the RST bit
 2032  */
 2033 
 2034  if (tcp_rst (tcp0))
 2035  {
 2036  /* If ACK is acceptable, signal client that peer is not
 2037  * willing to accept connection and drop connection*/
 2038  if (tcp_ack (tcp0))
 2039  tcp_connection_reset (tc0);
 2040  goto drop;
 2041  }
 2042 
 2043  /*
 2044  * 3. check the security and precedence (skipped)
 2045  */
 2046 
 2047  /*
 2048  * 4. check the SYN bit
 2049  */
 2050 
 2051  /* No SYN flag. Drop. */
 2052  if (!tcp_syn (tcp0))
 2053  {
 2054  clib_warning ("not synack");
 2055  goto drop;
 2056  }
 2057 
 2058  /* Parse options */
 2059  if (tcp_options_parse (tcp0, &tc0->rcv_opts))
 2060  {
 2061  clib_warning ("options parse fail");
 2062  goto drop;
 2063  }
 2064 
 2065  /* Valid SYN or SYN-ACK. Move connection from half-open pool to
 2066  * current thread pool. */
 2067  pool_get (tm->connections[my_thread_index], new_tc0);
 2068  clib_memcpy (new_tc0, tc0, sizeof (*new_tc0));
 2069  new_tc0->c_c_index = new_tc0 - tm->connections[my_thread_index];
 2070  new_tc0->c_thread_index = my_thread_index;
 2071  new_tc0->rcv_nxt = vnet_buffer (b0)->tcp.seq_end;
 2072  new_tc0->irs = seq0;
 2073  new_tc0->timers[TCP_TIMER_ESTABLISH] = TCP_TIMER_HANDLE_INVALID;
 2074  new_tc0->timers[TCP_TIMER_RETRANSMIT_SYN] =
/* NOTE(review): the continuation line, presumably TCP_TIMER_HANDLE_INVALID;
 * was lost in extraction here -- confirm. */
 2076 
 2077  /* If this is not the owning thread, wait for syn retransmit to
 2078  * expire and cleanup then */
/* NOTE(review): the condition line guarding the flag assignment below was
 * lost in extraction here -- confirm against the repository. */
 2080  tc0->flags |= TCP_CONN_HALF_OPEN_DONE;
 2081 
 2082  if (tcp_opts_tstamp (&new_tc0->rcv_opts))
 2083  {
 2084  new_tc0->tsval_recent = new_tc0->rcv_opts.tsval;
 2085  new_tc0->tsval_recent_age = tcp_time_now ();
 2086  }
 2087 
 2088  if (tcp_opts_wscale (&new_tc0->rcv_opts))
 2089  new_tc0->snd_wscale = new_tc0->rcv_opts.wscale;
 2090 
 2091  /* RFC1323: SYN and SYN-ACK wnd not scaled */
 2092  new_tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window);
 2093  new_tc0->snd_wl1 = seq0;
 2094  new_tc0->snd_wl2 = ack0;
 2095 
 2096  tcp_connection_init_vars (new_tc0);
 2097 
 2098  /* SYN-ACK: See if we can switch to ESTABLISHED state */
 2099  if (PREDICT_TRUE (tcp_ack (tcp0)))
 2100  {
 2101  /* Our SYN is ACKed: we have iss < ack = snd_una */
 2102 
 2103  /* TODO Dequeue acknowledged segments if we support Fast Open */
 2104  new_tc0->snd_una = ack0;
 2105  new_tc0->state = TCP_STATE_ESTABLISHED;
 2106 
 2107  /* Make sure las is initialized for the wnd computation */
 2108  new_tc0->rcv_las = new_tc0->rcv_nxt;
 2109 
 2110  /* Notify app that we have connection. If session layer can't
 2111  * allocate session send reset */
 2112  if (stream_session_connect_notify (&new_tc0->connection, 0))
 2113  {
 2114  clib_warning ("connect notify fail");
 2115  tcp_send_reset_w_pkt (new_tc0, b0, is_ip4);
 2116  tcp_connection_cleanup (new_tc0);
 2117  goto drop;
 2118  }
 2119 
 2120  /* Make sure after data segment processing ACK is sent */
 2121  new_tc0->flags |= TCP_CONN_SNDACK;
 2122 
 2123  /* Update rtt with the syn-ack sample */
 2124  tcp_update_rtt (new_tc0, vnet_buffer (b0)->tcp.ack_number);
 2125  TCP_EVT_DBG (TCP_EVT_SYNACK_RCVD, new_tc0);
 2126  }
 2127  /* SYN: Simultaneous open. Change state to SYN-RCVD and send SYN-ACK */
 2128  else
 2129  {
 2130  new_tc0->state = TCP_STATE_SYN_RCVD;
 2131 
 2132  /* Notify app that we have connection */
 2133  if (stream_session_connect_notify (&new_tc0->connection, 0))
 2134  {
 2135  tcp_connection_cleanup (new_tc0);
 2136  tcp_send_reset_w_pkt (tc0, b0, is_ip4);
 2137  TCP_EVT_DBG (TCP_EVT_RST_SENT, tc0);
 2138  goto drop;
 2139  }
 2140 
 2141  tc0->rtt_ts = 0;
 2142  tcp_init_snd_vars (tc0);
 2143  tcp_make_synack (new_tc0, b0);
 2144  next0 = tcp_next_output (is_ip4);
 2145 
 2146  goto drop;
 2147  }
 2148 
 2149  /* Read data, if any */
 2150  if (PREDICT_FALSE (vnet_buffer (b0)->tcp.data_len))
 2151  {
 2152  ASSERT (0);
 2153  error0 = tcp_segment_rcv (tm, new_tc0, b0, &next0);
 2154  if (error0 == TCP_ERROR_PURE_ACK)
 2155  error0 = TCP_ERROR_SYN_ACKS_RCVD;
 2156  }
 2157  else
 2158  {
 2159  tcp_make_ack (new_tc0, b0);
 2160  next0 = tcp_next_output (new_tc0->c_is_ip4);
 2161  }
 2162 
 2163  drop:
 2164 
 2165  b0->error = error0 ? node->errors[error0] : 0;
 2166  if (PREDICT_FALSE
 2167  ((b0->flags & VLIB_BUFFER_IS_TRACED) && tcp0 != 0))
 2168  {
 2169  t0 = vlib_add_trace (vm, node, b0, sizeof (*t0));
 2170  clib_memcpy (&t0->tcp_header, tcp0, sizeof (t0->tcp_header));
 2171  clib_memcpy (&t0->tcp_connection, tc0,
 2172  sizeof (t0->tcp_connection));
 2173  }
 2174 
 2175  vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
 2176  n_left_to_next, bi0, next0);
 2177  }
 2178 
 2179  vlib_put_next_frame (vm, node, next_index, n_left_to_next);
 2180  }
 2181 
 2182  errors = session_manager_flush_enqueue_events (my_thread_index);
 2183  if (errors)
 2184  {
 2185  if (is_ip4)
/* NOTE(review): the vlib_node_increment_counter call lines (charging the
 * tcp4/tcp6 syn-sent node error counters) were lost in extraction before
 * each continuation line below -- confirm. */
 2187  TCP_ERROR_EVENT_FIFO_FULL, errors);
 2188  else
 2190  TCP_ERROR_EVENT_FIFO_FULL, errors);
 2191  }
 2192 
 2193  return from_frame->n_vectors;
 2194 }
2195 
2196 static uword
2198  vlib_frame_t * from_frame)
2199 {
2200  return tcp46_syn_sent_inline (vm, node, from_frame, 1 /* is_ip4 */ );
2201 }
2202 
2203 static uword
2205  vlib_frame_t * from_frame)
2206 {
2207  return tcp46_syn_sent_inline (vm, node, from_frame, 0 /* is_ip4 */ );
2208 }
2209 
 2210 /* *INDENT-OFF* */
/* Graph node registration for the ip4 SYN-SENT rx node.
 * NOTE(review): the registration macro line, upstream
 * VLIB_REGISTER_NODE (tcp4_syn_sent_node) =, was lost in extraction
 * before the brace below -- confirm. */
 2212 {
 2213  .function = tcp4_syn_sent,
 2214  .name = "tcp4-syn-sent",
 2215  /* Takes a vector of packets. */
 2216  .vector_size = sizeof (u32),
 2217  .n_errors = TCP_N_ERROR,
 2218  .error_strings = tcp_error_strings,
 2219  .n_next_nodes = TCP_SYN_SENT_N_NEXT,
 2220  .next_nodes =
 2221  {
 2222 #define _(s,n) [TCP_SYN_SENT_NEXT_##s] = n,
/* NOTE(review): the foreach_tcp_state_next invocation line was lost in
 * extraction here -- confirm. */
 2224 #undef _
 2225  },
 2226  .format_trace = format_tcp_rx_trace_short,
 2227 };
 2228 /* *INDENT-ON* */
2229 
2231 
 2232 /* *INDENT-OFF* */
/* Graph node registration for the ip6 SYN-SENT rx node.
 * NOTE(review): the registration macro line, upstream
 * VLIB_REGISTER_NODE (tcp6_syn_sent_node) =, was lost in extraction
 * before the brace below -- confirm. */
 2234 {
 2235  .function = tcp6_syn_sent_rcv,
 2236  .name = "tcp6-syn-sent",
 2237  /* Takes a vector of packets. */
 2238  .vector_size = sizeof (u32),
 2239  .n_errors = TCP_N_ERROR,
 2240  .error_strings = tcp_error_strings,
 2241  .n_next_nodes = TCP_SYN_SENT_N_NEXT,
 2242  .next_nodes =
 2243  {
 2244 #define _(s,n) [TCP_SYN_SENT_NEXT_##s] = n,
/* NOTE(review): the foreach_tcp_state_next invocation line was lost in
 * extraction here -- confirm. */
 2246 #undef _
 2247  },
 2248  .format_trace = format_tcp_rx_trace_short,
 2249 };
 2250 /* *INDENT-ON* */
2251 
2253 
2254 /**
2255  * Handles reception for all states except LISTEN, SYN-SENT and ESTABLISHED
2256  * as per RFC793 p. 64
2257  */
2260  vlib_frame_t * from_frame, int is_ip4)
2261 {
2262  tcp_main_t *tm = vnet_get_tcp_main ();
2263  u32 n_left_from, next_index, *from, *to_next;
2264  u32 my_thread_index = vm->thread_index, errors = 0;
2265 
2266  from = vlib_frame_vector_args (from_frame);
2267  n_left_from = from_frame->n_vectors;
2268 
2269  next_index = node->cached_next_index;
2270 
2271  while (n_left_from > 0)
2272  {
2273  u32 n_left_to_next;
2274 
2275  vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
2276 
2277  while (n_left_from > 0 && n_left_to_next > 0)
2278  {
2279  u32 bi0;
2280  vlib_buffer_t *b0;
2281  tcp_header_t *tcp0 = 0;
2282  tcp_connection_t *tc0;
2283  u32 next0 = TCP_RCV_PROCESS_NEXT_DROP, error0 = TCP_ERROR_ENQUEUED;
2284  u8 is_fin0;
2285 
2286  bi0 = from[0];
2287  to_next[0] = bi0;
2288  from += 1;
2289  to_next += 1;
2290  n_left_from -= 1;
2291  n_left_to_next -= 1;
2292 
2293  b0 = vlib_get_buffer (vm, bi0);
2294  tc0 = tcp_connection_get (vnet_buffer (b0)->tcp.connection_index,
2295  my_thread_index);
2296  if (PREDICT_FALSE (tc0 == 0))
2297  {
2298  error0 = TCP_ERROR_INVALID_CONNECTION;
2299  goto drop;
2300  }
2301 
2302  tcp0 = tcp_buffer_hdr (b0);
2303  is_fin0 = tcp_is_fin (tcp0);
2304 
2305  /* SYNs, FINs and data consume sequence numbers */
2306  vnet_buffer (b0)->tcp.seq_end = vnet_buffer (b0)->tcp.seq_number
2307  + tcp_is_syn (tcp0) + is_fin0 + vnet_buffer (b0)->tcp.data_len;
2308 
2309  if (CLIB_DEBUG)
2310  {
2311  tcp_connection_t *tmp;
2312  tmp = tcp_lookup_connection (b0, my_thread_index, is_ip4);
2313  if (tmp->state != tc0->state)
2314  {
2315  clib_warning ("state changed");
2316  ASSERT (0);
2317  goto drop;
2318  }
2319  }
2320 
2321  /*
2322  * Special treatment for CLOSED
2323  */
2324  switch (tc0->state)
2325  {
2326  case TCP_STATE_CLOSED:
2327  goto drop;
2328  break;
2329  }
2330 
2331  /*
2332  * For all other states (except LISTEN)
2333  */
2334 
2335  /* 1-4: check SEQ, RST, SYN */
2336  if (PREDICT_FALSE (tcp_segment_validate (vm, tc0, b0, tcp0,
2337  &next0)))
2338  {
2339  error0 = TCP_ERROR_SEGMENT_INVALID;
2340  goto drop;
2341  }
2342 
2343  /* 5: check the ACK field */
2344  switch (tc0->state)
2345  {
2346  case TCP_STATE_SYN_RCVD:
2347  /*
2348  * If the segment acknowledgment is not acceptable, form a
2349  * reset segment,
2350  * <SEQ=SEG.ACK><CTL=RST>
2351  * and send it.
2352  */
2353  if (!tcp_rcv_ack_is_acceptable (tc0, b0))
2354  {
2355  clib_warning ("connection not accepted");
2356  tcp_send_reset_w_pkt (tc0, b0, is_ip4);
2357  goto drop;
2358  }
2359 
2360  /* Update rtt and rto */
2361  tcp_update_rtt (tc0, vnet_buffer (b0)->tcp.ack_number);
2362 
2363  /* Switch state to ESTABLISHED */
2364  tc0->state = TCP_STATE_ESTABLISHED;
2365 
2366  /* Initialize session variables */
2367  tc0->snd_una = vnet_buffer (b0)->tcp.ack_number;
2368  tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window)
2369  << tc0->rcv_opts.wscale;
2370  tc0->snd_wl1 = vnet_buffer (b0)->tcp.seq_number;
2371  tc0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number;
2372  stream_session_accept_notify (&tc0->connection);
2373 
2374  /* Reset SYN-ACK retransmit and SYN_RCV establish timers */
2376  tcp_timer_reset (tc0, TCP_TIMER_ESTABLISH);
2377  TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0);
2378  break;
2379  case TCP_STATE_ESTABLISHED:
2380  /* We can get packets in established state here because they
2381  * were enqueued before state change */
2382  if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
2383  goto drop;
2384 
2385  break;
2386  case TCP_STATE_FIN_WAIT_1:
2387  /* In addition to the processing for the ESTABLISHED state, if
2388  * our FIN is now acknowledged then enter FIN-WAIT-2 and
2389  * continue processing in that state. */
2390  if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
2391  goto drop;
2392 
2393  /* Still have to send the FIN */
2394  if (tc0->flags & TCP_CONN_FINPNDG)
2395  {
2396  /* TX fifo finally drained */
2397  if (!stream_session_tx_fifo_max_dequeue (&tc0->connection))
2398  tcp_send_fin (tc0);
2399  }
2400  /* If FIN is ACKed */
2401  else if (tc0->snd_una == tc0->snd_una_max)
2402  {
2403  tc0->state = TCP_STATE_FIN_WAIT_2;
2404  TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0);
2405 
2406  /* Stop all retransmit timers because we have nothing more
2407  * to send. Enable waitclose though because we're willing to
2408  * wait for peer's FIN but not indefinitely. */
2410  tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
2411  }
2412  break;
2413  case TCP_STATE_FIN_WAIT_2:
2414  /* In addition to the processing for the ESTABLISHED state, if
2415  * the retransmission queue is empty, the user's CLOSE can be
2416  * acknowledged ("ok") but do not delete the TCB. */
2417  if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
2418  goto drop;
2419  break;
2420  case TCP_STATE_CLOSE_WAIT:
2421  /* Do the same processing as for the ESTABLISHED state. */
2422  if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
2423  goto drop;
2424  break;
2425  case TCP_STATE_CLOSING:
2426  /* In addition to the processing for the ESTABLISHED state, if
2427  * the ACK acknowledges our FIN then enter the TIME-WAIT state,
2428  * otherwise ignore the segment. */
2429  if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
2430  goto drop;
2431 
2432  tc0->state = TCP_STATE_TIME_WAIT;
2433  TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0);
2434  tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
2435  goto drop;
2436 
2437  break;
2438  case TCP_STATE_LAST_ACK:
2439  /* The only thing that [should] arrive in this state is an
2440  * acknowledgment of our FIN. If our FIN is now acknowledged,
2441  * delete the TCB, enter the CLOSED state, and return. */
2442 
2443  if (!tcp_rcv_ack_is_acceptable (tc0, b0))
2444  goto drop;
2445 
2446  tc0->snd_una = vnet_buffer (b0)->tcp.ack_number;
2447  /* Apparently our FIN was lost */
2448  if (is_fin0)
2449  {
2450  tcp_send_fin (tc0);
2451  goto drop;
2452  }
2453 
2454  tc0->state = TCP_STATE_CLOSED;
2455  TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0);
2456 
2457  /* Don't delete the connection/session yet. Instead, wait a
2458  * reasonable amount of time until the pipes are cleared. In
2459  * particular, this makes sure that we won't have dead sessions
2460  * when processing events on the tx path */
2461  tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME);
2463 
2464  goto drop;
2465 
2466  break;
2467  case TCP_STATE_TIME_WAIT:
2468  /* The only thing that can arrive in this state is a
2469  * retransmission of the remote FIN. Acknowledge it, and restart
2470  * the 2 MSL timeout. */
2471 
2472  if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
2473  goto drop;
2474 
2475  tcp_make_ack (tc0, b0);
2476  tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
2477 
2478  goto drop;
2479 
2480  break;
2481  default:
2482  ASSERT (0);
2483  }
2484 
2485  /* 6: check the URG bit TODO */
2486 
2487  /* 7: process the segment text */
2488  switch (tc0->state)
2489  {
2490  case TCP_STATE_ESTABLISHED:
2491  case TCP_STATE_FIN_WAIT_1:
2492  case TCP_STATE_FIN_WAIT_2:
2493  if (vnet_buffer (b0)->tcp.data_len)
2494  error0 = tcp_segment_rcv (tm, tc0, b0, &next0);
2495  else if (is_fin0)
2496  tc0->rcv_nxt += 1;
2497  break;
2498  case TCP_STATE_CLOSE_WAIT:
2499  case TCP_STATE_CLOSING:
2500  case TCP_STATE_LAST_ACK:
2501  case TCP_STATE_TIME_WAIT:
2502  /* This should not occur, since a FIN has been received from the
2503  * remote side. Ignore the segment text. */
2504  break;
2505  }
2506 
2507  /* 8: check the FIN bit */
2508  if (!is_fin0)
2509  goto drop;
2510 
2511  switch (tc0->state)
2512  {
2513  case TCP_STATE_ESTABLISHED:
2514  case TCP_STATE_SYN_RCVD:
2515  /* Send FIN-ACK notify app and enter CLOSE-WAIT */
2517  tcp_make_fin (tc0, b0);
2518  tc0->snd_nxt += 1;
2519  next0 = tcp_next_output (tc0->c_is_ip4);
2520  stream_session_disconnect_notify (&tc0->connection);
2521  tc0->state = TCP_STATE_CLOSE_WAIT;
2522  TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0);
2523  break;
2524  case TCP_STATE_CLOSE_WAIT:
2525  case TCP_STATE_CLOSING:
2526  case TCP_STATE_LAST_ACK:
2527  /* move along .. */
2528  break;
2529  case TCP_STATE_FIN_WAIT_1:
2530  tc0->state = TCP_STATE_CLOSING;
2531  tcp_make_ack (tc0, b0);
2532  next0 = tcp_next_output (is_ip4);
2533  TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0);
2534  /* Wait for ACK but not forever */
2535  tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
2536  break;
2537  case TCP_STATE_FIN_WAIT_2:
2538  /* Got FIN, send ACK! Be more aggressive with resource cleanup */
2539  tc0->state = TCP_STATE_TIME_WAIT;
2541  tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_TIMEWAIT_TIME);
2542  tcp_make_ack (tc0, b0);
2543  next0 = tcp_next_output (is_ip4);
2544  TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0);
2545  break;
2546  case TCP_STATE_TIME_WAIT:
2547  /* Remain in the TIME-WAIT state. Restart the time-wait
2548  * timeout.
2549  */
2550  tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_TIMEWAIT_TIME);
2551  break;
2552  }
2553  TCP_EVT_DBG (TCP_EVT_FIN_RCVD, tc0);
2554 
2555  drop:
2556  b0->error = error0 ? node->errors[error0] : 0;
2557 
2559  {
2560  tcp_rx_trace_t *t0 =
2561  vlib_add_trace (vm, node, b0, sizeof (*t0));
2562  tcp_set_rx_trace_data (t0, tc0, tcp0, b0, is_ip4);
2563  }
2564 
2565  vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
2566  n_left_to_next, bi0, next0);
2567  }
2568 
2569  vlib_put_next_frame (vm, node, next_index, n_left_to_next);
2570  }
2571 
2572  errors = session_manager_flush_enqueue_events (my_thread_index);
2573  if (errors)
2574  {
2575  if (is_ip4)
2577  TCP_ERROR_EVENT_FIFO_FULL, errors);
2578  else
2580  TCP_ERROR_EVENT_FIFO_FULL, errors);
2581  }
2582 
2583  return from_frame->n_vectors;
2584 }
2585 
2586 static uword
2588  vlib_frame_t * from_frame)
2589 {
2590  return tcp46_rcv_process_inline (vm, node, from_frame, 1 /* is_ip4 */ );
2591 }
2592 
2593 static uword
2595  vlib_frame_t * from_frame)
2596 {
2597  return tcp46_rcv_process_inline (vm, node, from_frame, 0 /* is_ip4 */ );
2598 }
2599 
2600 /* *INDENT-OFF* */
2602 {
2603  .function = tcp4_rcv_process,
2604  .name = "tcp4-rcv-process",
2605  /* Takes a vector of packets. */
2606  .vector_size = sizeof (u32),
2607  .n_errors = TCP_N_ERROR,
2608  .error_strings = tcp_error_strings,
2609  .n_next_nodes = TCP_RCV_PROCESS_N_NEXT,
2610  .next_nodes =
2611  {
2612 #define _(s,n) [TCP_RCV_PROCESS_NEXT_##s] = n,
2614 #undef _
2615  },
2616  .format_trace = format_tcp_rx_trace_short,
2617 };
2618 /* *INDENT-ON* */
2619 
2621 
2622 /* *INDENT-OFF* */
2624 {
2625  .function = tcp6_rcv_process,
2626  .name = "tcp6-rcv-process",
2627  /* Takes a vector of packets. */
2628  .vector_size = sizeof (u32),
2629  .n_errors = TCP_N_ERROR,
2630  .error_strings = tcp_error_strings,
2631  .n_next_nodes = TCP_RCV_PROCESS_N_NEXT,
2632  .next_nodes =
2633  {
2634 #define _(s,n) [TCP_RCV_PROCESS_NEXT_##s] = n,
2636 #undef _
2637  },
2638  .format_trace = format_tcp_rx_trace_short,
2639 };
2640 /* *INDENT-ON* */
2641 
2643 
2646 
2647 /**
2648  * LISTEN state processing as per RFC 793 p. 65
2649  */
/* NOTE(review): extraction dropped the opening signature lines here;
 * upstream this reads "always_inline uword tcp46_listen_inline
 * (vlib_main_t * vm, vlib_node_runtime_t * node, ..." -- restore before
 * compiling. */
2652  vlib_frame_t * from_frame, int is_ip4)
2653 {
2654  u32 n_left_from, next_index, *from, *to_next;
2655  u32 my_thread_index = vm->thread_index;
2656  u8 sst = is_ip4 ? SESSION_TYPE_IP4_TCP : SESSION_TYPE_IP6_TCP;
2657 
2658  from = vlib_frame_vector_args (from_frame);
2659  n_left_from = from_frame->n_vectors;
2660 
2661  next_index = node->cached_next_index;
2662 
2663  while (n_left_from > 0)
2664  {
2665  u32 n_left_to_next;
2666 
2667  vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
2668 
2669  while (n_left_from > 0 && n_left_to_next > 0)
2670  {
2671  u32 bi0;
2672  vlib_buffer_t *b0;
2673  tcp_rx_trace_t *t0;
2674  tcp_header_t *th0 = 0;
2675  tcp_connection_t *lc0;
2676  ip4_header_t *ip40;
2677  ip6_header_t *ip60;
2678  tcp_connection_t *child0;
2679  u32 error0 = TCP_ERROR_SYNS_RCVD, next0 = TCP_LISTEN_NEXT_DROP;
2680 
2681  bi0 = from[0];
2682  to_next[0] = bi0;
2683  from += 1;
2684  to_next += 1;
2685  n_left_from -= 1;
2686  n_left_to_next -= 1;
2687 
2688  b0 = vlib_get_buffer (vm, bi0);
2689  lc0 = tcp_listener_get (vnet_buffer (b0)->tcp.connection_index);
2690 
2691  if (is_ip4)
2692  {
2693  ip40 = vlib_buffer_get_current (b0);
2694  th0 = ip4_next_header (ip40);
2695  }
2696  else
2697  {
2698  ip60 = vlib_buffer_get_current (b0);
2699  th0 = ip6_next_header (ip60);
2700  }
2701 
2702  /* Create child session. For syn-flood protection use filter */
2703 
2704  /* 1. first check for an RST: handled in dispatch */
2705  /* if (tcp_rst (th0))
2706  goto drop; */
2707 
2708  /* 2. second check for an ACK: handled in dispatch */
2709  /* if (tcp_ack (th0))
2710  {
2711  tcp_send_reset (b0, is_ip4);
2712  goto drop;
2713  } */
2714 
2715  /* 3. check for a SYN (did that already) */
2716 
2717  /* Make sure connection wasn't just created */
2718  child0 = tcp_lookup_connection (b0, my_thread_index, is_ip4);
2719  if (PREDICT_FALSE (child0->state != TCP_STATE_LISTEN))
2720  {
2721  error0 = TCP_ERROR_CREATE_EXISTS;
2722  goto drop;
2723  }
2724 
2725  /* Create child session and send SYN-ACK */
2726  child0 = tcp_connection_new (my_thread_index);
2727  child0->c_lcl_port = lc0->c_lcl_port;
2728  child0->c_rmt_port = th0->src_port;
2729  child0->c_is_ip4 = is_ip4;
2730  child0->state = TCP_STATE_SYN_RCVD;
2731 
2732  if (is_ip4)
2733  {
2734  child0->c_lcl_ip4.as_u32 = ip40->dst_address.as_u32;
2735  child0->c_rmt_ip4.as_u32 = ip40->src_address.as_u32;
2736  }
2737  else
2738  {
2739  clib_memcpy (&child0->c_lcl_ip6, &ip60->dst_address,
2740  sizeof (ip6_address_t));
2741  clib_memcpy (&child0->c_rmt_ip6, &ip60->src_address,
2742  sizeof (ip6_address_t));
2743  }
2744 
2745  if (stream_session_accept (&child0->connection, lc0->c_s_index, sst,
2746  0 /* notify */ ))
2747  {
2748  clib_warning ("session accept fail");
2749  tcp_connection_cleanup (child0);
2750  error0 = TCP_ERROR_CREATE_SESSION_FAIL;
2751  goto drop;
2752  }
2753 
2754  if (tcp_options_parse (th0, &child0->rcv_opts))
2755  {
2756  clib_warning ("options parse fail");
2757  goto drop;
2758  }
2759 
2760  child0->irs = vnet_buffer (b0)->tcp.seq_number;
2761  child0->rcv_nxt = vnet_buffer (b0)->tcp.seq_number + 1;
2762  child0->rcv_las = child0->rcv_nxt;
2763 
2764  /* RFC1323: TSval timestamps sent on {SYN} and {SYN,ACK}
2765  * segments are used to initialize PAWS. */
2766  if (tcp_opts_tstamp (&child0->rcv_opts))
2767  {
2768  child0->tsval_recent = child0->rcv_opts.tsval;
2769  child0->tsval_recent_age = tcp_time_now ();
2770  }
2771 
2772  if (tcp_opts_wscale (&child0->rcv_opts))
2773  child0->snd_wscale = child0->rcv_opts.wscale;
2774 
/* Peer send window from the SYN's window field; scaled only when the
 * peer advertised a window-scale option (snd_wscale stays 0 otherwise). */
2775  child0->snd_wnd = clib_net_to_host_u16 (th0->window)
2776  << child0->snd_wscale;
2777  child0->snd_wl1 = vnet_buffer (b0)->tcp.seq_number;
2778  child0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number;
2779 
2780  tcp_connection_init_vars (child0);
2781  TCP_EVT_DBG (TCP_EVT_SYN_RCVD, child0, 1);
2782 
2783  /* Reuse buffer to make syn-ack and send */
2784  tcp_make_synack (child0, b0);
2785  next0 = tcp_next_output (is_ip4);
2786  tcp_timer_set (child0, TCP_TIMER_ESTABLISH, TCP_SYN_RCVD_TIME);
2787 
2788  drop:
/* NOTE(review): an "if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))"
 * guard appears to have been dropped by extraction before this brace --
 * verify against upstream. */
2790  {
2791  t0 = vlib_add_trace (vm, node, b0, sizeof (*t0));
2792  clib_memcpy (&t0->tcp_header, th0, sizeof (t0->tcp_header));
2793  clib_memcpy (&t0->tcp_connection, lc0,
2794  sizeof (t0->tcp_connection));
2795  }
2796 
2797  b0->error = node->errors[error0];
2798 
2799  vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
2800  n_left_to_next, bi0, next0);
2801  }
2802 
2803  vlib_put_next_frame (vm, node, next_index, n_left_to_next);
2804  }
2805  return from_frame->n_vectors;
2806 }
2807 
2808 static uword
2810  vlib_frame_t * from_frame)
2811 {
2812  return tcp46_listen_inline (vm, node, from_frame, 1 /* is_ip4 */ );
2813 }
2814 
2815 static uword
2817  vlib_frame_t * from_frame)
2818 {
2819  return tcp46_listen_inline (vm, node, from_frame, 0 /* is_ip4 */ );
2820 }
2821 
2822 /* *INDENT-OFF* */
2824 {
2825  .function = tcp4_listen,
2826  .name = "tcp4-listen",
2827  /* Takes a vector of packets. */
2828  .vector_size = sizeof (u32),
2829  .n_errors = TCP_N_ERROR,
2830  .error_strings = tcp_error_strings,
2831  .n_next_nodes = TCP_LISTEN_N_NEXT,
2832  .next_nodes =
2833  {
2834 #define _(s,n) [TCP_LISTEN_NEXT_##s] = n,
2836 #undef _
2837  },
2838  .format_trace = format_tcp_rx_trace_short,
2839 };
2840 /* *INDENT-ON* */
2841 
2843 
2844 /* *INDENT-OFF* */
2846 {
2847  .function = tcp6_listen,
2848  .name = "tcp6-listen",
2849  /* Takes a vector of packets. */
2850  .vector_size = sizeof (u32),
2851  .n_errors = TCP_N_ERROR,
2852  .error_strings = tcp_error_strings,
2853  .n_next_nodes = TCP_LISTEN_N_NEXT,
2854  .next_nodes =
2855  {
2856 #define _(s,n) [TCP_LISTEN_NEXT_##s] = n,
2858 #undef _
2859  },
2860  .format_trace = format_tcp_rx_trace_short,
2861 };
2862 /* *INDENT-ON* */
2863 
2865 
2868 
/* Next-node indices for the tcp{4,6}-input dispatch.
 * NOTE(review): the enumerator lines were dropped by extraction;
 * reconstructed from the foreach_tcp4_input_next arc ordering and the
 * [TCP_INPUT_NEXT_##s] initializers below -- confirm against upstream. */
typedef enum _tcp_input_next
{
  TCP_INPUT_NEXT_DROP,
  TCP_INPUT_NEXT_LISTEN,
  TCP_INPUT_NEXT_RCV_PROCESS,
  TCP_INPUT_NEXT_SYN_SENT,
  TCP_INPUT_NEXT_ESTABLISHED,
  TCP_INPUT_NEXT_RESET,
  TCP_INPUT_NEXT_PUNT,
  TCP_INPUT_N_NEXT
} tcp_input_next_t;
/* Outgoing arcs of tcp4-input, in TCP_INPUT_NEXT_* index order. */
2881 #define foreach_tcp4_input_next \
2882  _ (DROP, "error-drop") \
2883  _ (LISTEN, "tcp4-listen") \
2884  _ (RCV_PROCESS, "tcp4-rcv-process") \
2885  _ (SYN_SENT, "tcp4-syn-sent") \
2886  _ (ESTABLISHED, "tcp4-established") \
2887  _ (RESET, "tcp4-reset") \
2888  _ (PUNT, "error-punt")
2889 
/* Outgoing arcs of tcp6-input, mirroring the ip4 list. */
2890 #define foreach_tcp6_input_next \
2891  _ (DROP, "error-drop") \
2892  _ (LISTEN, "tcp6-listen") \
2893  _ (RCV_PROCESS, "tcp6-rcv-process") \
2894  _ (SYN_SENT, "tcp6-syn-sent") \
2895  _ (ESTABLISHED, "tcp6-established") \
2896  _ (RESET, "tcp6-reset") \
2897  _ (PUNT, "error-punt")
2898 
/* Only these header flag bits are used to index the per-state dispatch
 * table (see tcp46_input_inline: tcp0->flags & filter_flags). */
2899 #define filter_flags (TCP_FLAG_SYN|TCP_FLAG_ACK|TCP_FLAG_RST|TCP_FLAG_FIN)
2900 
/* Shared ip4/ip6 TCP input: parse the header, look up the transport
 * connection for this buffer, stash seq/ack/data metadata in the buffer
 * opaque, then dispatch on (connection state, filtered flags) via
 * tm->dispatch_table.
 * NOTE(review): extraction dropped the opening signature lines; upstream
 * this is "always_inline uword tcp46_input_inline (vlib_main_t * vm,
 * vlib_node_runtime_t * node, ..." -- restore before compiling. */
2903  vlib_frame_t * from_frame, int is_ip4)
2904 {
2905  u32 n_left_from, next_index, *from, *to_next;
2906  u32 my_thread_index = vm->thread_index;
2907  tcp_main_t *tm = vnet_get_tcp_main ();
2908 
2909  from = vlib_frame_vector_args (from_frame);
2910  n_left_from = from_frame->n_vectors;
2911  next_index = node->cached_next_index;
2912  tcp_set_time_now (my_thread_index);
2913 
2914  while (n_left_from > 0)
2915  {
2916  u32 n_left_to_next;
2917 
2918  vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
2919 
2920  while (n_left_from > 0 && n_left_to_next > 0)
2921  {
2922  int n_advance_bytes0, n_data_bytes0;
2923  u32 bi0;
2924  vlib_buffer_t *b0;
2925  tcp_header_t *tcp0 = 0;
2926  tcp_connection_t *tc0;
2927  transport_connection_t *tconn;
2928  ip4_header_t *ip40;
2929  ip6_header_t *ip60;
2930  u32 error0 = TCP_ERROR_NO_LISTENER, next0 = TCP_INPUT_NEXT_DROP;
2931  u8 flags0;
2932 
2933  bi0 = from[0];
2934  to_next[0] = bi0;
2935  from += 1;
2936  to_next += 1;
2937  n_left_from -= 1;
2938  n_left_to_next -= 1;
2939 
2940  b0 = vlib_get_buffer (vm, bi0);
2941  vnet_buffer (b0)->tcp.flags = 0;
2942 
2943  /* Checksum computed by ipx_local no need to compute again */
2944 
2945  if (is_ip4)
2946  {
2947  ip40 = vlib_buffer_get_current (b0);
2948  tcp0 = ip4_next_header (ip40);
2949  n_advance_bytes0 = (ip4_header_bytes (ip40)
2950  + tcp_header_bytes (tcp0));
2951  n_data_bytes0 = clib_net_to_host_u16 (ip40->length)
2952  - n_advance_bytes0;
/* NOTE(review): the first line of the session-layer transport lookup
 * ("tconn = stream_session_lookup_... (&ip40->dst_address,") was dropped
 * by extraction -- restore from upstream. */
2954  &ip40->src_address,
2955  tcp0->dst_port,
2956  tcp0->src_port,
2957  SESSION_TYPE_IP4_TCP,
2958  my_thread_index);
2959  tc0 = tcp_get_connection_from_transport (tconn);
2960  ASSERT (tcp_lookup_is_valid (tc0, tcp0));
2961  }
2962  else
2963  {
2964  ip60 = vlib_buffer_get_current (b0);
2965  tcp0 = ip6_next_header (ip60);
2966  n_advance_bytes0 = tcp_header_bytes (tcp0);
2967  n_data_bytes0 = clib_net_to_host_u16 (ip60->payload_length)
2968  - n_advance_bytes0;
2969  n_advance_bytes0 += sizeof (ip60[0]);
/* NOTE(review): first line of the ip6 transport lookup call dropped by
 * extraction, as in the ip4 branch above. */
2971  &ip60->src_address,
2972  tcp0->dst_port,
2973  tcp0->src_port,
2974  SESSION_TYPE_IP6_TCP,
2975  my_thread_index);
2976  tc0 = tcp_get_connection_from_transport (tconn);
2977  ASSERT (tcp_lookup_is_valid (tc0, tcp0));
2978  }
2979 
2980  /* Length check */
2981  if (PREDICT_FALSE (n_advance_bytes0 < 0))
2982  {
2983  error0 = TCP_ERROR_LENGTH;
2984  goto done;
2985  }
2986 
2987  /* Session exists */
2988  if (PREDICT_TRUE (0 != tc0))
2989  {
2990  /* Save connection index */
2991  vnet_buffer (b0)->tcp.connection_index = tc0->c_c_index;
2992  vnet_buffer (b0)->tcp.seq_number =
2993  clib_net_to_host_u32 (tcp0->seq_number);
2994  vnet_buffer (b0)->tcp.ack_number =
2995  clib_net_to_host_u32 (tcp0->ack_number);
2996 
2997  vnet_buffer (b0)->tcp.hdr_offset = (u8 *) tcp0
2998  - (u8 *) vlib_buffer_get_current (b0);
2999  vnet_buffer (b0)->tcp.data_offset = n_advance_bytes0;
3000  vnet_buffer (b0)->tcp.data_len = n_data_bytes0;
3001 
/* Table-driven dispatch: next node and error chosen by
 * (connection state, SYN/ACK/RST/FIN flag combination). */
3002  flags0 = tcp0->flags & filter_flags;
3003  next0 = tm->dispatch_table[tc0->state][flags0].next;
3004  error0 = tm->dispatch_table[tc0->state][flags0].error;
3005 
3006  if (PREDICT_FALSE (error0 == TCP_ERROR_DISPATCH
3007  || next0 == TCP_INPUT_NEXT_RESET))
3008  {
3009  /* Overload tcp flags to store state */
3010  tcp_state_t state0 = tc0->state;
3011  vnet_buffer (b0)->tcp.flags = tc0->state;
3012 
3013  if (error0 == TCP_ERROR_DISPATCH)
3014  clib_warning ("disp error state %U flags %U",
/* NOTE(review): the argument line ("format_tcp_state, state0,
 * format_tcp_flags," or similar) was dropped by extraction here. */
3016  (int) flags0);
3017  }
3018  }
3019  else
3020  {
3021  if ((is_ip4 && tm->punt_unknown4) ||
3022  (!is_ip4 && tm->punt_unknown6))
3023  {
3024  next0 = TCP_INPUT_NEXT_PUNT;
3025  error0 = TCP_ERROR_PUNT;
3026  }
3027  else
3028  {
3029  /* Send reset */
3030  next0 = TCP_INPUT_NEXT_RESET;
3031  error0 = TCP_ERROR_NO_LISTENER;
3032  }
3033  }
3034 
3035  done:
3036  b0->error = error0 ? node->errors[error0] : 0;
3037 
/* NOTE(review): an "if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))"
 * guard appears to have been dropped by extraction before this brace --
 * verify against upstream. */
3039  {
3040  tcp_rx_trace_t *t0 =
3041  vlib_add_trace (vm, node, b0, sizeof (*t0));
3042  tcp_set_rx_trace_data (t0, tc0, tcp0, b0, is_ip4);
3043  }
3044  vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
3045  n_left_to_next, bi0, next0);
3046  }
3047 
3048  vlib_put_next_frame (vm, node, next_index, n_left_to_next);
3049  }
3050 
3051  return from_frame->n_vectors;
3052 }
3053 
3054 static uword
3056  vlib_frame_t * from_frame)
3057 {
3058  return tcp46_input_inline (vm, node, from_frame, 1 /* is_ip4 */ );
3059 }
3060 
3061 static uword
3063  vlib_frame_t * from_frame)
3064 {
3065  return tcp46_input_inline (vm, node, from_frame, 0 /* is_ip4 */ );
3066 }
3067 
3068 /* *INDENT-OFF* */
3070 {
3071  .function = tcp4_input,
3072  .name = "tcp4-input",
3073  /* Takes a vector of packets. */
3074  .vector_size = sizeof (u32),
3075  .n_errors = TCP_N_ERROR,
3076  .error_strings = tcp_error_strings,
3077  .n_next_nodes = TCP_INPUT_N_NEXT,
3078  .next_nodes =
3079  {
3080 #define _(s,n) [TCP_INPUT_NEXT_##s] = n,
3082 #undef _
3083  },
3084  .format_buffer = format_tcp_header,
3085  .format_trace = format_tcp_rx_trace,
3086 };
3087 /* *INDENT-ON* */
3088 
3090 
3091 /* *INDENT-OFF* */
3093 {
3094  .function = tcp6_input,
3095  .name = "tcp6-input",
3096  /* Takes a vector of packets. */
3097  .vector_size = sizeof (u32),
3098  .n_errors = TCP_N_ERROR,
3099  .error_strings = tcp_error_strings,
3100  .n_next_nodes = TCP_INPUT_N_NEXT,
3101  .next_nodes =
3102  {
3103 #define _(s,n) [TCP_INPUT_NEXT_##s] = n,
3105 #undef _
3106  },
3107  .format_buffer = format_tcp_header,
3108  .format_trace = format_tcp_rx_trace,
3109 };
3110 /* *INDENT-ON* */
3111 
3113 
/* Build the per-(state, flags) dispatch table used by tcp46_input_inline:
 * default every cell to DROP/TCP_ERROR_DISPATCH, then whitelist the legal
 * (state, flag-combination) pairs.
 * NOTE(review): extraction dropped the line carrying the parameter list
 * ("tcp_dispatch_table_init (tcp_main_t * tm)") and the FIRST line of
 * several two-line _() entries -- the orphan "TCP_ERROR_NONE);"
 * continuation lines below mark each casualty. Restore from upstream
 * before compiling. */
3114 static void
3116 {
3117  int i, j;
3118  for (i = 0; i < ARRAY_LEN (tm->dispatch_table); i++)
3119  for (j = 0; j < ARRAY_LEN (tm->dispatch_table[i]); j++)
3120  {
3121  tm->dispatch_table[i][j].next = TCP_INPUT_NEXT_DROP;
3122  tm->dispatch_table[i][j].error = TCP_ERROR_DISPATCH;
3123  }
3124 
/* Helper: set both next-node and error for state t / flag combo f. */
3125 #define _(t,f,n,e) \
3126 do { \
3127  tm->dispatch_table[TCP_STATE_##t][f].next = (n); \
3128  tm->dispatch_table[TCP_STATE_##t][f].error = (e); \
3129 } while (0)
3130 
3131  /* SYNs for new connections -> tcp-listen. */
3132  _(LISTEN, TCP_FLAG_SYN, TCP_INPUT_NEXT_LISTEN, TCP_ERROR_NONE);
3133  _(LISTEN, TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, TCP_ERROR_NONE);
3134  _(LISTEN, TCP_FLAG_RST, TCP_INPUT_NEXT_DROP, TCP_ERROR_NONE);
3136  TCP_ERROR_NONE);
3137  /* ACK for for a SYN-ACK -> tcp-rcv-process. */
3138  _(SYN_RCVD, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3139  _(SYN_RCVD, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3140  _(SYN_RCVD, TCP_FLAG_SYN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3141  /* SYN-ACK for a SYN */
3143  TCP_ERROR_NONE);
3144  _(SYN_SENT, TCP_FLAG_ACK, TCP_INPUT_NEXT_SYN_SENT, TCP_ERROR_NONE);
3145  _(SYN_SENT, TCP_FLAG_RST, TCP_INPUT_NEXT_SYN_SENT, TCP_ERROR_NONE);
3147  TCP_ERROR_NONE);
3148  /* ACK for for established connection -> tcp-established. */
3149  _(ESTABLISHED, TCP_FLAG_ACK, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
3150  /* FIN for for established connection -> tcp-established. */
3151  _(ESTABLISHED, TCP_FLAG_FIN, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
3153  TCP_ERROR_NONE);
3154  _(ESTABLISHED, TCP_FLAG_RST, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
3156  TCP_ERROR_NONE);
3157  _(ESTABLISHED, TCP_FLAG_SYN, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
3159  TCP_ERROR_NONE);
3160  /* ACK or FIN-ACK to our FIN */
3161  _(FIN_WAIT_1, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3163  TCP_ERROR_NONE);
3164  /* FIN in reply to our FIN from the other side */
3165  _(FIN_WAIT_1, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3166  _(FIN_WAIT_1, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3167  /* FIN confirming that the peer (app) has closed */
3168  _(FIN_WAIT_2, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3169  _(FIN_WAIT_2, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3171  TCP_ERROR_NONE);
3172  _(CLOSE_WAIT, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3174  TCP_ERROR_NONE);
3175  _(LAST_ACK, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3176  _(LAST_ACK, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3178  TCP_ERROR_NONE);
3179  _(LAST_ACK, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3180  _(TIME_WAIT, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3182  TCP_ERROR_NONE);
3183  _(TIME_WAIT, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3184  _(TIME_WAIT, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3185  _(CLOSED, TCP_FLAG_ACK, TCP_INPUT_NEXT_DROP, TCP_ERROR_CONNECTION_CLOSED);
3186  _(CLOSED, TCP_FLAG_RST, TCP_INPUT_NEXT_DROP, TCP_ERROR_CONNECTION_CLOSED);
3188  TCP_ERROR_CONNECTION_CLOSED);
3189 #undef _
3190 }
3191 
3192 clib_error_t *
3194 {
3195  clib_error_t *error = 0;
3196  tcp_main_t *tm = vnet_get_tcp_main ();
3197 
3198  if ((error = vlib_call_init_function (vm, tcp_init)))
3199  return error;
3200 
3201  /* Initialize dispatch table. */
3203 
3204  return error;
3205 }
3206 
3208 
3209 /*
3210  * fd.io coding-style-patch-verification: ON
3211  *
3212  * Local Variables:
3213  * eval: (c-set-style "gnu")
3214  * End:
3215  */
#define tcp_in_cong_recovery(tc)
Definition: tcp.h:330
static int tcp_session_enqueue_ooo(tcp_connection_t *tc, vlib_buffer_t *b, u16 data_len)
Enqueue out-of-order data.
Definition: tcp_input.c:1438
static void tcp_update_timestamp(tcp_connection_t *tc, u32 seq, u32 seq_end)
Update tsval recent.
Definition: tcp_input.c:242
int session_manager_flush_enqueue_events(u32 thread_index)
Flushes queue of sessions that are to be notified of new data enqueued events.
Definition: session.c:392
static u8 tcp_should_fastrecover(tcp_connection_t *tc)
Definition: tcp_input.c:1030
#define TCP_2MSL_TIME
Definition: tcp.h:101
End of options.
Definition: tcp_packet.h:104
sll srl srl sll sra u16x4 i
Definition: vector_sse2.h:337
#define tcp_fastrecovery_1_smss_off(tc)
Definition: tcp.h:328
static int tcp_segment_rcv(tcp_main_t *tm, tcp_connection_t *tc, vlib_buffer_t *b, u32 *next0)
Definition: tcp_input.c:1539
#define clib_min(x, y)
Definition: clib.h:332
#define CLIB_UNUSED(x)
Definition: clib.h:79
static void tcp_cc_update(tcp_connection_t *tc, vlib_buffer_t *b)
Definition: tcp_input.c:1002
vlib_node_registration_t tcp6_rcv_process_node
(constructor) VLIB_REGISTER_NODE (tcp6_rcv_process_node)
Definition: tcp_input.c:2623
#define tcp_in_recovery(tc)
Definition: tcp.h:324
#define TCP_OPTION_LEN_SACK_PERMITTED
Definition: tcp_packet.h:167
static int tcp_rcv_ack_is_acceptable(tcp_connection_t *tc0, vlib_buffer_t *tb0)
Definition: tcp_input.c:375
#define seq_leq(_s1, _s2)
Definition: tcp.h:533
void tcp_make_fin(tcp_connection_t *tc, vlib_buffer_t *b)
Convert buffer to FIN-ACK.
Definition: tcp_output.c:556
struct _sack_block sack_block_t
int stream_session_accept(transport_connection_t *tc, u32 listener_index, u8 sst, u8 notify)
Accept a stream session.
Definition: session.c:594
void tcp_rcv_sacks(tcp_connection_t *tc, u32 ack)
Definition: tcp_input.c:720
#define timestamp_leq(_t1, _t2)
Definition: tcp.h:540
ip4_address_t src_address
Definition: ip4_packet.h:164
static u8 tcp_cc_is_spurious_retransmit(tcp_connection_t *tc)
Definition: tcp_input.c:972
static uword tcp46_input_inline(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *from_frame, int is_ip4)
Definition: tcp_input.c:2902
enum _tcp_state_next tcp_state_next_t
static tcp_connection_t * tcp_lookup_connection(vlib_buffer_t *b, u8 thread_index, u8 is_ip4)
Lookup transport connection.
Definition: tcp_input.c:1891
struct _transport_connection transport_connection_t
#define tcp_rst(_th)
Definition: tcp_packet.h:81
#define TCP_TIMEWAIT_TIME
Definition: tcp.h:103
static uword tcp6_input(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *from_frame)
Definition: tcp_input.c:3062
Selective Ack permitted.
Definition: tcp_packet.h:108
#define TCP_FLAG_SYN
Definition: fa_node.h:8
#define tcp_opts_tstamp(_to)
Definition: tcp_packet.h:157
#define PREDICT_TRUE(x)
Definition: clib.h:98
void tcp_fast_retransmit(tcp_connection_t *tc)
Do fast retransmit.
Definition: tcp_output.c:1721
static int tcp_segment_validate(vlib_main_t *vm, tcp_connection_t *tc0, vlib_buffer_t *b0, tcp_header_t *th0, u32 *next0)
Validate incoming segment as per RFC793 p.
Definition: tcp_input.c:270
tcp_connection_t * tcp_connection_new(u8 thread_index)
Definition: tcp.c:230
static void tcp_dispatch_table_init(tcp_main_t *tm)
Definition: tcp_input.c:3115
int stream_session_enqueue_data(transport_connection_t *tc, vlib_buffer_t *b, u32 offset, u8 queue_event, u8 is_in_order)
Definition: session.c:220
static int ip4_header_bytes(ip4_header_t *i)
Definition: ip4_packet.h:227
struct _sack_scoreboard sack_scoreboard_t
static tcp_connection_t * tcp_half_open_connection_get(u32 conn_index)
Definition: tcp.h:501
void tcp_update_rto(tcp_connection_t *tc)
Definition: tcp_input.c:415
void scoreboard_update_bytes(tcp_connection_t *tc, sack_scoreboard_t *sb)
Definition: tcp_input.c:594
#define tcp_doff(_th)
Definition: tcp_packet.h:78
struct _tcp_main tcp_main_t
u32 thread_index
Definition: main.h:173
void tcp_connection_timers_reset(tcp_connection_t *tc)
Stop all connection timers.
Definition: tcp.c:459
#define vec_add1(V, E)
Add 1 element to end of vector (unspecified alignment).
Definition: vec.h:518
#define tcp_recovery_off(tc)
Definition: tcp.h:322
#define clib_abs(x)
Definition: clib.h:339
struct _vlib_node_registration vlib_node_registration_t
static sack_scoreboard_hole_t * scoreboard_prev_hole(sack_scoreboard_t *sb, sack_scoreboard_hole_t *hole)
Definition: tcp.h:832
static int tcp_update_rtt(tcp_connection_t *tc, u32 ack)
Update RTT estimate and RTO timer.
Definition: tcp_input.c:433
#define vec_add2(V, P, N)
Add N elements to end of vector V, return pointer to new elements in P.
Definition: vec.h:557
vlib_node_registration_t tcp4_rcv_process_node
(constructor) VLIB_REGISTER_NODE (tcp4_rcv_process_node)
Definition: tcp_input.c:2601
struct _tcp_connection tcp_connection_t
u8 * format(u8 *s, const char *fmt,...)
Definition: format.c:419
#define tcp_opts_sack(_to)
Definition: tcp_packet.h:159
void tcp_fast_retransmit_sack(tcp_connection_t *tc)
Do fast retransmit with SACKs.
Definition: tcp_output.c:1607
tcp_connection_t tcp_connection
Definition: tcp_input.c:1622
static u8 tcp_sack_vector_is_sane(sack_block_t *sacks)
Definition: tcp_input.c:1300
static tcp_connection_t * tcp_get_connection_from_transport(transport_connection_t *tconn)
Definition: tcp.h:470
static void tcp_cc_congestion_undo(tcp_connection_t *tc)
Definition: tcp_input.c:958
void tcp_send_reset_w_pkt(tcp_connection_t *tc, vlib_buffer_t *pkt, u8 is_ip4)
Send reset without reusing existing buffer.
Definition: tcp_output.c:821
vlib_error_t * errors
Vector of errors for this node.
Definition: node.h:415
No operation.
Definition: tcp_packet.h:105
format_function_t format_tcp_flags
Definition: tcp.h:64
#define pool_get(P, E)
Allocate an object E from a pool P (unspecified alignment).
Definition: pool.h:225
u8 n_sack_blocks
Number of SACKs blocks.
Definition: tcp_packet.h:152
struct _tcp_header tcp_header_t
static u32 tcp_available_snd_space(const tcp_connection_t *tc)
Estimate of how many bytes we can still push into the network.
Definition: tcp.h:621
int tcp_half_open_connection_cleanup(tcp_connection_t *tc)
Try to cleanup half-open connection.
Definition: tcp.c:147
static uword tcp6_listen(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *from_frame)
Definition: tcp_input.c:2816
ip6_address_t src_address
Definition: ip6_packet.h:341
vlib_node_registration_t tcp6_syn_sent_node
(constructor) VLIB_REGISTER_NODE (tcp6_syn_sent_node)
Definition: tcp_input.c:1857
struct _sack_scoreboard_hole sack_scoreboard_hole_t
u8 wscale
Window scale advertised.
Definition: tcp_packet.h:148
static void tcp_established_inc_counter(vlib_main_t *vm, u8 is_ip4, u8 evt, u8 val)
Definition: tcp_input.c:1672
#define vec_reset_length(v)
Reset vector length to zero NULL-pointer tolerant.
static void tcp_dequeue_acked(tcp_connection_t *tc, u32 ack)
Dequeue bytes that have been acked and while at it update RTT estimates.
Definition: tcp_input.c:477
#define tcp_fastrecovery_on(tc)
Definition: tcp.h:319
void tcp_flush_frame_to_output(vlib_main_t *vm, u8 thread_index, u8 is_ip4)
Flush tx frame populated by retransmits and timer pops.
Definition: tcp_output.c:1004
Limit MSS.
Definition: tcp_packet.h:106
static uword tcp4_listen(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *from_frame)
Definition: tcp_input.c:2809
#define VLIB_BUFFER_NEXT_PRESENT
Definition: buffer.h:95
static u32 scoreboard_hole_index(sack_scoreboard_t *sb, sack_scoreboard_hole_t *hole)
Definition: tcp.h:882
#define tcp_is_fin(_th)
Definition: tcp_packet.h:90
static uword tcp6_rcv_process(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *from_frame)
Definition: tcp_input.c:2594
static uword tcp4_syn_sent(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *from_frame)
Definition: tcp_input.c:2197
#define seq_gt(_s1, _s2)
Definition: tcp.h:534
void tcp_init_snd_vars(tcp_connection_t *tc)
Initialize connection send variables.
Definition: tcp.c:569
#define VLIB_INIT_FUNCTION(x)
Definition: init.h:111
vlib_node_registration_t tcp4_established_node
(constructor) VLIB_REGISTER_NODE (tcp4_established_node)
Definition: tcp_input.c:78
#define TCP_CLOSEWAIT_TIME
Definition: tcp.h:102
void stream_session_accept_notify(transport_connection_t *tc)
Definition: session.c:506
#define always_inline
Definition: clib.h:84
static uword format_get_indent(u8 *s)
Definition: format.h:72
#define TCP_OPTION_LEN_SACK_BLOCK
Definition: tcp_packet.h:169
ip4_address_t dst_address
Definition: ip4_packet.h:164
#define TCP_FLAG_ACK
Definition: fa_node.h:11
u8 * format_white_space(u8 *s, va_list *va)
Definition: std-formats.c:113
#define TCP_DELACK_TIME
Definition: tcp.h:98
static tcp_header_t * tcp_buffer_hdr(vlib_buffer_t *b)
Definition: tcp.h:439
static void tcp_cc_recovery_exit(tcp_connection_t *tc)
Definition: tcp_input.c:934
enum _tcp_state tcp_state_t
#define TCP_ALWAYS_ACK
On/off delayed acks.
Definition: tcp.h:38
vlib_node_registration_t tcp6_input_node
(constructor) VLIB_REGISTER_NODE (tcp6_input_node)
Definition: tcp_input.c:2867
static u8 tcp_ack_is_dupack(tcp_connection_t *tc, vlib_buffer_t *b, u32 prev_snd_wnd, u32 prev_snd_una)
Check if duplicate ack as per RFC5681 Sec.
Definition: tcp_input.c:497
#define TCP_RTO_MAX
Definition: tcp.h:107
static u32 ooo_segment_length(svm_fifo_t *f, ooo_segment_t *s)
Definition: svm_fifo.h:199
static void * ip4_next_header(ip4_header_t *i)
Definition: ip4_packet.h:233
transport_connection_t * stream_session_half_open_lookup(ip46_address_t *lcl, ip46_address_t *rmt, u16 lcl_port, u16 rmt_port, u8 proto)
static u32 tcp_time_now(void)
Definition: tcp.h:658
sack_block_t * sacks
SACK blocks.
Definition: tcp_packet.h:151
#define vec_end(v)
End (last data address) of vector.
static tcp_cc_algorithm_t * tcp_cc_algo_get(tcp_cc_algorithm_type_e type)
Definition: tcp.h:909
static u32 scoreboard_hole_bytes(sack_scoreboard_hole_t *hole)
Definition: tcp.h:876
struct _stream_session_t stream_session_t
#define vlib_call_init_function(vm, x)
Definition: init.h:162
#define TCP_MAX_SACK_BLOCKS
Max number of SACK blocks stored.
Definition: tcp.h:162
#define tcp_validate_txf_size(_tc, _a)
Definition: tcp.h:797
#define TCP_EVT_DBG(_evt, _args...)
Definition: tcp_debug.h:234
#define timestamp_lt(_t1, _t2)
Definition: tcp.h:539
static void tcp_timer_set(tcp_connection_t *tc, u8 timer_id, u32 interval)
Definition: tcp.h:700
#define TCP_OPTION_LEN_WINDOW_SCALE
Definition: tcp_packet.h:166
static void svm_fifo_newest_ooo_segment_reset(svm_fifo_t *f)
Definition: svm_fifo.h:165
static heap_elt_t * first(heap_header_t *h)
Definition: heap.c:59
u32 stream_session_dequeue_drop(transport_connection_t *tc, u32 max_bytes)
Definition: session.c:307
#define TCP_INVALID_SACK_HOLE_INDEX
Definition: tcp.h:163
#define pool_elt_at_index(p, i)
Returns pointer to element at given index.
Definition: pool.h:458
void tcp_cc_init(tcp_connection_t *tc)
Definition: tcp_input.c:1200
u8 * format_tcp_rx_trace(u8 *s, va_list *args)
Definition: tcp_input.c:1626
u16 current_length
Nbytes between current data and the end of this buffer.
Definition: buffer.h:72
void tcp_cc_fastrecovery_exit(tcp_connection_t *tc)
Definition: tcp_input.c:946
static uword tcp46_listen_inline(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *from_frame, int is_ip4)
LISTEN state processing as per RFC 793 p.
Definition: tcp_input.c:2651
#define tcp_in_fastrecovery(tc)
Definition: tcp.h:323
void tcp_retransmit_first_unacked(tcp_connection_t *tc)
Retransmit first unacked segment.
Definition: tcp_output.c:1584
#define foreach_tcp4_input_next
Definition: tcp_input.c:2881
static sack_scoreboard_hole_t * scoreboard_next_hole(sack_scoreboard_t *sb, sack_scoreboard_hole_t *hole)
Definition: tcp.h:824
static u32 ooo_segment_offset(svm_fifo_t *f, ooo_segment_t *s)
Definition: svm_fifo.h:187
static void * vlib_buffer_get_current(vlib_buffer_t *b)
Get pointer to current data to process.
Definition: buffer.h:193
#define filter_flags
Definition: tcp_input.c:2899
#define pool_put(P, E)
Free an object E in pool P.
Definition: pool.h:270
static int tcp_buffer_discard_bytes(vlib_buffer_t *b, u32 n_bytes_to_drop)
Definition: tcp_input.c:1511
#define foreach_tcp6_input_next
Definition: tcp_input.c:2890
#define TCP_TIMER_HANDLE_INVALID
Definition: tcp.h:93
void tcp_fast_retransmit_no_sack(tcp_connection_t *tc)
Fast retransmit without SACK info.
Definition: tcp_output.c:1684
#define TCP_CLEANUP_TIME
Definition: tcp.h:104
#define PREDICT_FALSE(x)
Definition: clib.h:97
transport_connection_t * stream_session_lookup_transport_wt6(ip6_address_t *lcl, ip6_address_t *rmt, u16 lcl_port, u16 rmt_port, u8 proto, u32 my_thread_index)
#define vec_del1(v, i)
Delete the element at index I.
Definition: vec.h:801
int tcp_options_parse(tcp_header_t *th, tcp_options_t *to)
Parse TCP header options.
Definition: tcp_input.c:123
#define TCP_FLAG_FIN
Definition: fa_node.h:7
#define vlib_validate_buffer_enqueue_x1(vm, node, next_index, to_next, n_left_to_next, bi0, next0)
Finish enqueueing one buffer forward in the graph.
Definition: buffer_node.h:216
static sack_scoreboard_hole_t * scoreboard_last_hole(sack_scoreboard_t *sb)
Definition: tcp.h:848
#define vlib_get_next_frame(vm, node, next_index, vectors, n_vectors_left)
Get pointer to next frame vector data by (vlib_node_runtime_t, next_index).
Definition: node_funcs.h:364
vlib_node_registration_t tcp4_listen_node
(constructor) VLIB_REGISTER_NODE (tcp4_listen_node)
Definition: tcp_input.c:2644
#define TCP_OPTION_LEN_TIMESTAMP
Definition: tcp_packet.h:168
static u8 tcp_lookup_is_valid(tcp_connection_t *tc, tcp_header_t *hdr)
Definition: tcp_input.c:1860
static ooo_segment_t * svm_fifo_newest_ooo_segment(svm_fifo_t *f)
Definition: svm_fifo.h:157
vlib_error_t error
Error code for buffers to be enqueued to error handler.
Definition: buffer.h:113
Selective Ack block.
Definition: tcp_packet.h:109
vlib_node_registration_t tcp6_established_node
(constructor) VLIB_REGISTER_NODE (tcp6_established_node)
Definition: tcp_input.c:79
static int tcp_can_delack(tcp_connection_t *tc)
Check if ACK could be delayed.
Definition: tcp_input.c:1495
static void vlib_node_increment_counter(vlib_main_t *vm, u32 node_index, u32 counter_index, u64 increment)
Definition: node_funcs.h:1158
#define TCP_FLAG_RST
Definition: fa_node.h:9
#define TCP_MAX_WND_SCALE
Definition: tcp_packet.h:173
static void tcp_timer_reset(tcp_connection_t *tc, u8 timer_id)
Definition: tcp.h:710
static sack_scoreboard_hole_t * scoreboard_first_hole(sack_scoreboard_t *sb)
Definition: tcp.h:840
static uword tcp6_syn_sent_rcv(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *from_frame)
Definition: tcp_input.c:2204
vlib_node_registration_t tcp4_syn_sent_node
(constructor) VLIB_REGISTER_NODE (tcp4_syn_sent_node)
Definition: tcp_input.c:1856
u16 n_vectors
Definition: node.h:344
vlib_main_t * vm
Definition: buffer.c:283
int stream_session_connect_notify(transport_connection_t *tc, u8 is_fail)
Definition: session.c:441
u32 stream_session_tx_fifo_max_dequeue(transport_connection_t *tc)
Definition: session.c:290
#define vec_free(V)
Free vector's memory (no header).
Definition: vec.h:336
#define TCP_DUPACK_THRESHOLD
Definition: tcp.h:34
format_function_t format_tcp_state
Definition: tcp.h:63
#define clib_warning(format, args...)
Definition: error.h:59
#define VLIB_BUFFER_IS_TRACED
Definition: buffer.h:93
#define clib_memcpy(a, b, c)
Definition: string.h:69
static int tcp_rcv_ack(tcp_connection_t *tc, vlib_buffer_t *b, tcp_header_t *th, u32 *next, u32 *error)
Process incoming ACK.
Definition: tcp_input.c:1210
tcp_header_t tcp_header
Definition: tcp_input.c:1621
format_function_t format_tcp_header
Definition: format.h:102
void tcp_make_synack(tcp_connection_t *ts, vlib_buffer_t *b)
Convert buffer to SYN-ACK.
Definition: tcp_output.c:602
#define ARRAY_LEN(x)
Definition: clib.h:59
void vlib_put_next_frame(vlib_main_t *vm, vlib_node_runtime_t *r, u32 next_index, u32 n_vectors_left)
Release pointer to next frame vector data.
Definition: main.c:454
#define TCP_RTT_MAX
Definition: tcp.h:109
sack_scoreboard_hole_t * scoreboard_next_rxt_hole(sack_scoreboard_t *sb, sack_scoreboard_hole_t *start, u8 have_sent_1_smss, u8 *can_rescue, u8 *snd_limited)
Figure out the next hole to retransmit.
Definition: tcp_input.c:638
u16 mss
Option flags, see above.
Definition: tcp_packet.h:147
static void * ip6_next_header(ip6_header_t *i)
Definition: ip6_packet.h:351
void tcp_make_ack(tcp_connection_t *ts, vlib_buffer_t *b)
Convert buffer to ACK.
Definition: tcp_output.c:541
void stream_session_disconnect_notify(transport_connection_t *tc)
Notification from transport that connection is being closed.
Definition: session.c:524
u8 tcp_scoreboard_is_sane_post_recovery(tcp_connection_t *tc)
Test that scoreboard is sane after recovery.
Definition: tcp_input.c:712
static void tcp_timer_update(tcp_connection_t *tc, u8 timer_id, u32 interval)
Definition: tcp.h:722
#define TCP_PAWS_IDLE
24 days
Definition: tcp.h:30
u16 cached_next_index
Next frame index that vector arguments were last enqueued to last time this node ran.
Definition: node.h:456
clib_error_t * tcp_input_init(vlib_main_t *vm)
Definition: tcp_input.c:3193
#define ASSERT(truth)
#define tcp_syn(_th)
Definition: tcp_packet.h:80
unsigned int u32
Definition: types.h:88
static void tcp_estimate_rtt(tcp_connection_t *tc, u32 mrtt)
Compute smoothed RTT as per VJ's '88 SIGCOMM and RFC6298.
Definition: tcp_input.c:392
enum _tcp_rcv_process_next tcp_rcv_process_next_t
static uword tcp4_established(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *from_frame)
Definition: tcp_input.c:1798
#define seq_geq(_s1, _s2)
Definition: tcp.h:535
u32 next_buffer
Next buffer for this linked-list of buffers.
Definition: buffer.h:109
static uword tcp46_established_inline(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *from_frame, int is_ip4)
Definition: tcp_input.c:1684
static void vlib_buffer_advance(vlib_buffer_t *b, word l)
Advance current data pointer by the supplied (signed!) amount.
Definition: buffer.h:206
static int tcp_segment_check_paws(tcp_connection_t *tc)
RFC1323: Check against wrapped sequence numbers (PAWS).
Definition: tcp_input.c:232
static void tcp_cc_handle_event(tcp_connection_t *tc, u32 is_dack)
One function to rule them all ...
Definition: tcp_input.c:1040
transport_connection_t * stream_session_lookup_transport_wt4(ip4_address_t *lcl, ip4_address_t *rmt, u16 lcl_port, u16 rmt_port, u8 proto, u32 my_thread_index)
enum _tcp_input_next tcp_input_next_t
void tcp_update_sack_list(tcp_connection_t *tc, u32 start, u32 end)
Build SACK list as per RFC2018.
Definition: tcp_input.c:1323
Out-of-order segment.
Definition: svm_fifo.h:27
static u8 tcp_segment_in_rcv_wnd(tcp_connection_t *tc, u32 seq, u32 end_seq)
Validate segment sequence number.
Definition: tcp_input.c:109
#define clib_max(x, y)
Definition: clib.h:325
static vlib_main_t * vlib_get_main(void)
Definition: global_funcs.h:23
u64 uword
Definition: types.h:112
static void * vlib_add_trace(vlib_main_t *vm, vlib_node_runtime_t *r, vlib_buffer_t *b, u32 n_data_bytes)
Definition: trace_funcs.h:55
VLIB_NODE_FUNCTION_MULTIARCH(tcp4_established_node, tcp4_established)
void scoreboard_init_high_rxt(sack_scoreboard_t *sb, u32 seq)
Definition: tcp_input.c:693
static uword tcp6_established(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *from_frame)
Definition: tcp_input.c:1805
u32 total_length_not_including_first_buffer
Only valid for first buffer in chain.
Definition: buffer.h:141
#define seq_lt(_s1, _s2)
Definition: tcp.h:532
#define tcp_is_syn(_th)
Definition: tcp_packet.h:89
#define tcp_opts_wscale(_to)
Definition: tcp_packet.h:158
enum _tcp_syn_sent_next tcp_syn_sent_next_t
static sack_scoreboard_hole_t * scoreboard_get_hole(sack_scoreboard_t *sb, u32 index)
Definition: tcp.h:816
static void tcp_update_snd_wnd(tcp_connection_t *tc, u32 seq, u32 ack, u32 snd_wnd)
Try to update snd_wnd based on feedback received from peer.
Definition: tcp_input.c:893
unsigned short u16
Definition: types.h:57
void tcp_connection_reset(tcp_connection_t *tc)
Notify session that connection has been reset.
Definition: tcp.c:247
u32 tsval
Timestamp value.
Definition: tcp_packet.h:149
enum _tcp_established_next tcp_established_next_t
u16 payload_length
Definition: ip6_packet.h:332
sack_scoreboard_hole_t * scoreboard_insert_hole(sack_scoreboard_t *sb, u32 prev_index, u32 start, u32 end)
Definition: tcp_input.c:557
u32 tsecr
Echoed/reflected time stamp.
Definition: tcp_packet.h:150
vlib_node_registration_t tcp4_input_node
(constructor) VLIB_REGISTER_NODE (tcp4_input_node)
Definition: tcp_input.c:2866
void tcp_send_fin(tcp_connection_t *tc)
Send FIN.
Definition: tcp_output.c:1050
#define vec_len(v)
Number of elements in vector (rvalue-only, NULL tolerant)
unsigned char u8
Definition: types.h:56
enum _tcp_listen_next tcp_listen_next_t
#define foreach_tcp_state_next
Definition: tcp_input.c:29
static u32 tcp_set_time_now(u32 thread_index)
Definition: tcp.h:664
static u8 tcp_is_lost_fin(tcp_connection_t *tc)
Definition: tcp.h:633
static uword tcp4_rcv_process(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *from_frame)
Definition: tcp_input.c:2587
static void tcp_retransmit_timer_update(tcp_connection_t *tc)
Definition: tcp.h:778
static int tcp_session_enqueue_data(tcp_connection_t *tc, vlib_buffer_t *b, u16 data_len)
Enqueue data for delivery to application.
Definition: tcp_input.c:1378
static u8 tcp_should_fastrecover_sack(tcp_connection_t *tc)
Definition: tcp_input.c:1024
#define seq_max(_s1, _s2)
Definition: tcp.h:536
static void * vlib_frame_vector_args(vlib_frame_t *f)
Get pointer to frame vector data.
Definition: node_funcs.h:267
void tcp_connection_init_vars(tcp_connection_t *tc)
Initialize tcp connection variables.
Definition: tcp.c:593
#define TCP_OPTION_LEN_MSS
Definition: tcp_packet.h:165
#define tcp_next_output(is_ip4)
Definition: tcp_input.c:75
clib_error_t * tcp_init(vlib_main_t *vm)
Definition: tcp.c:1420
#define TCP_RTO_MIN
Definition: tcp.h:108
struct clib_bihash_value offset
template key/value backing page structure
#define tcp_scoreboard_trace_add(_tc, _ack)
Definition: tcp.h:226
u8 * format_tcp_connection(u8 *s, va_list *args)
Definition: tcp.c:865
void tcp_set_rx_trace_data(tcp_rx_trace_t *t0, tcp_connection_t *tc0, tcp_header_t *th0, vlib_buffer_t *b0, u8 is_ip4)
Definition: tcp_input.c:1657
#define vnet_buffer(b)
Definition: buffer.h:306
static tcp_connection_t * tcp_connection_get(u32 conn_index, u32 thread_index)
Definition: tcp.h:451
#define VLIB_REGISTER_NODE(x,...)
Definition: node.h:143
static int tcp_header_bytes(tcp_header_t *t)
Definition: tcp_packet.h:93
void tcp_connection_cleanup(tcp_connection_t *tc)
Cleans up connection state.
Definition: tcp.c:176
static uword tcp4_input(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *from_frame)
Definition: tcp_input.c:3055
Window scale.
Definition: tcp_packet.h:107
vlib_node_registration_t tcp6_listen_node
(constructor) VLIB_REGISTER_NODE (tcp6_listen_node)
Definition: tcp_input.c:2645
#define tcp_opts_sack_permitted(_to)
Definition: tcp_packet.h:160
int tcp_cc_recover(tcp_connection_t *tc)
Definition: tcp_input.c:981
Timestamps.
Definition: tcp_packet.h:110
void scoreboard_remove_hole(sack_scoreboard_t *sb, sack_scoreboard_hole_t *hole)
Definition: tcp_input.c:522
#define TCP_SYN_RCVD_TIME
Definition: tcp.h:100
u32 flags
buffer flags: VLIB_BUFFER_FREE_LIST_INDEX_MASK: bits used to store free list index, VLIB_BUFFER_IS_TRACED: trace this buffer.
Definition: buffer.h:75
static void tcp_persist_timer_set(tcp_connection_t *tc)
Definition: tcp.h:755
static tcp_main_t * vnet_get_tcp_main()
Definition: tcp.h:433
static uword tcp46_syn_sent_inline(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *from_frame, int is_ip4)
Definition: tcp_input.c:1928
#define tcp_fastrecovery_off(tc)
Definition: tcp.h:320
static uword tcp46_rcv_process_inline(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *from_frame, int is_ip4)
Handles reception for all states except LISTEN, SYN-SENT and ESTABLISHED as per RFC793 p...
Definition: tcp_input.c:2259
static void tcp_retransmit_timer_reset(tcp_connection_t *tc)
Definition: tcp.h:742
static vlib_buffer_t * vlib_get_buffer(vlib_main_t *vm, u32 buffer_index)
Translate buffer index into buffer pointer.
Definition: buffer_funcs.h:57
u8 * format_tcp_rx_trace_short(u8 *s, va_list *args)
Definition: tcp_input.c:1642
#define tcp_ack(_th)
Definition: tcp_packet.h:83
static u8 tcp_timer_is_active(tcp_connection_t *tc, tcp_timers_e timer)
Definition: tcp.h:792
static tcp_connection_t * tcp_listener_get(u32 tli)
Definition: tcp.h:495
ip6_address_t dst_address
Definition: ip6_packet.h:341
static u8 tcp_ack_is_cc_event(tcp_connection_t *tc, vlib_buffer_t *b, u32 prev_snd_wnd, u32 prev_snd_una, u8 *is_dack)
Checks if ack is a congestion control event.
Definition: tcp_input.c:510
static stream_session_t * stream_session_get(u32 si, u32 thread_index)
Definition: session.h:217
void tcp_cc_init_congestion(tcp_connection_t *tc)
Definition: tcp_input.c:925
static void tcp_persist_timer_reset(tcp_connection_t *tc)
Definition: tcp.h:772
static char * tcp_error_strings[]
Definition: tcp_input.c:22