1 /* 2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 3. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 * 29 * @(#)tcp_input.c 8.5 (Berkeley) 4/10/94 30 * tcp_input.c,v 1.10 1994/10/13 18:36:32 wollman Exp 31 */ 32 33 /* 34 * Changes and additions relating to SLiRP 35 * Copyright (c) 1995 Danny Gasparovski. 36 * 37 * Please read the file COPYRIGHT for the 38 * terms and conditions of the copyright. 39 */ 40 41 #include <slirp.h> 42 #include "ip_icmp.h" 43 44 struct socket tcb; 45 46 #define TCPREXMTTHRESH 3 47 struct socket *tcp_last_so = &tcb; 48 49 tcp_seq tcp_iss; /* tcp initial send seq # */ 50 51 #define TCP_PAWS_IDLE (24 * 24 * 60 * 60 * PR_SLOWHZ) 52 53 /* for modulo comparisons of timestamps */ 54 #define TSTMP_LT(a,b) ((int)((a)-(b)) < 0) 55 #define TSTMP_GEQ(a,b) ((int)((a)-(b)) >= 0) 56 57 /* 58 * Insert segment ti into reassembly queue of tcp with 59 * control block tp. Return TH_FIN if reassembly now includes 60 * a segment with FIN. The macro form does the common case inline 61 * (segment is the next to be received on an established connection, 62 * and the queue is empty), avoiding linkage into and removal 63 * from the queue and repetition of various conversions. 64 * Set DELACK for segments received in order, but ack immediately 65 * when segments are out of order (so fast retransmit can work). 66 */ 67 #ifdef TCP_ACK_HACK 68 #define TCP_REASS(tp, ti, m, so, flags) {\ 69 if ((ti)->ti_seq == (tp)->rcv_nxt && \ 70 tcpfrag_list_empty(tp) && \ 71 (tp)->t_state == TCPS_ESTABLISHED) {\ 72 if (ti->ti_flags & TH_PUSH) \ 73 tp->t_flags |= TF_ACKNOW; \ 74 else \ 75 tp->t_flags |= TF_DELACK; \ 76 (tp)->rcv_nxt += (ti)->ti_len; \ 77 flags = (ti)->ti_flags & TH_FIN; \ 78 STAT(tcpstat.tcps_rcvpack++); \ 79 STAT(tcpstat.tcps_rcvbyte += (ti)->ti_len); \ 80 if (so->so_emu) { \ 81 if (tcp_emu((so),(m))) sbappend((so), (m)); \ 82 } else \ 83 sbappend((so), (m)); \ 84 /* sorwakeup(so); */ \ 85 } else {\ 86 (flags) = tcp_reass((tp), (ti), (m)); \ 87 tp->t_flags |= TF_ACKNOW; \ 88 } \ 89 } 90 #else 91 #define TCP_REASS(tp, ti, m, so, flags) { \ 92 if ((ti)->ti_seq == (tp)->rcv_nxt && \ 93 tcpfrag_list_empty(tp) && \ 94 (tp)->t_state == TCPS_ESTABLISHED) { \ 95 tp->t_flags |= TF_DELACK; \ 96 (tp)->rcv_nxt += (ti)->ti_len; \ 97 flags = (ti)->ti_flags & TH_FIN; \ 98 STAT(tcpstat.tcps_rcvpack++); \ 99 STAT(tcpstat.tcps_rcvbyte += (ti)->ti_len); \ 100 if (so->so_emu) { \ 101 if (tcp_emu((so),(m))) sbappend(so, (m)); \ 102 } else \ 103 sbappend((so), (m)); \ 104 /* sorwakeup(so); */ \ 105 } else { \ 106 (flags) = tcp_reass((tp), (ti), (m)); \ 107 tp->t_flags |= TF_ACKNOW; \ 108 } \ 109 } 110 #endif 111 static void tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, 112 struct tcpiphdr *ti); 113 static void tcp_xmit_timer(register struct tcpcb *tp, int rtt); 114 115 static int 116 tcp_reass(register struct tcpcb *tp, register struct tcpiphdr *ti, 117 struct mbuf *m) 118 { 119 register struct tcpiphdr *q; 120 struct socket *so = tp->t_socket; 121 int flags; 122 123 /* 124 * Call with ti==NULL after become established to 125 * force pre-ESTABLISHED data up to user socket. 126 */ 127 if (ti == NULL) 128 goto present; 129 130 /* 131 * Find a segment which begins after this one does. 132 */ 133 for (q = tcpfrag_list_first(tp); !tcpfrag_list_end(q, tp); 134 q = tcpiphdr_next(q)) 135 if (SEQ_GT(q->ti_seq, ti->ti_seq)) 136 break; 137 138 /* 139 * If there is a preceding segment, it may provide some of 140 * our data already. If so, drop the data from the incoming 141 * segment. If it provides all of our data, drop us. 142 */ 143 if (!tcpfrag_list_end(tcpiphdr_prev(q), tp)) { 144 register int i; 145 q = tcpiphdr_prev(q); 146 /* conversion to int (in i) handles seq wraparound */ 147 i = q->ti_seq + q->ti_len - ti->ti_seq; 148 if (i > 0) { 149 if (i >= ti->ti_len) { 150 STAT(tcpstat.tcps_rcvduppack++); 151 STAT(tcpstat.tcps_rcvdupbyte += ti->ti_len); 152 m_freem(m); 153 /* 154 * Try to present any queued data 155 * at the left window edge to the user. 156 * This is needed after the 3-WHS 157 * completes. 158 */ 159 goto present; /* ??? */ 160 } 161 m_adj(m, i); 162 ti->ti_len -= i; 163 ti->ti_seq += i; 164 } 165 q = tcpiphdr_next(q); 166 } 167 STAT(tcpstat.tcps_rcvoopack++); 168 STAT(tcpstat.tcps_rcvoobyte += ti->ti_len); 169 ti->ti_mbuf = m; 170 171 /* 172 * While we overlap succeeding segments trim them or, 173 * if they are completely covered, dequeue them. 174 */ 175 while (!tcpfrag_list_end(q, tp)) { 176 register int i = (ti->ti_seq + ti->ti_len) - q->ti_seq; 177 if (i <= 0) 178 break; 179 if (i < q->ti_len) { 180 q->ti_seq += i; 181 q->ti_len -= i; 182 m_adj(q->ti_mbuf, i); 183 break; 184 } 185 q = tcpiphdr_next(q); 186 m = tcpiphdr_prev(q)->ti_mbuf; 187 remque(tcpiphdr2qlink(tcpiphdr_prev(q))); 188 m_freem(m); 189 } 190 191 /* 192 * Stick new segment in its place. 193 */ 194 insque(tcpiphdr2qlink(ti), tcpiphdr2qlink(tcpiphdr_prev(q))); 195 196 present: 197 /* 198 * Present data to user, advancing rcv_nxt through 199 * completed sequence space. 200 */ 201 if (!TCPS_HAVEESTABLISHED(tp->t_state)) 202 return (0); 203 ti = tcpfrag_list_first(tp); 204 if (tcpfrag_list_end(ti, tp) || ti->ti_seq != tp->rcv_nxt) 205 return (0); 206 if (tp->t_state == TCPS_SYN_RECEIVED && ti->ti_len) 207 return (0); 208 do { 209 tp->rcv_nxt += ti->ti_len; 210 flags = ti->ti_flags & TH_FIN; 211 remque(tcpiphdr2qlink(ti)); 212 m = ti->ti_mbuf; 213 ti = tcpiphdr_next(ti); 214 /* if (so->so_state & SS_FCANTRCVMORE) */ 215 if (so->so_state & SS_FCANTSENDMORE) 216 m_freem(m); 217 else { 218 if (so->so_emu) { 219 if (tcp_emu(so,m)) sbappend(so, m); 220 } else 221 sbappend(so, m); 222 } 223 } while (ti != (struct tcpiphdr *)tp && ti->ti_seq == tp->rcv_nxt); 224 /* sorwakeup(so); */ 225 return (flags); 226 } 227 228 /* 229 * TCP input routine, follows pages 65-76 of the 230 * protocol specification dated September, 1981 very closely. 231 */ 232 void 233 tcp_input(struct mbuf *m, int iphlen, struct socket *inso) 234 { 235 struct ip save_ip, *ip; 236 register struct tcpiphdr *ti; 237 caddr_t optp = NULL; 238 int optlen = 0; 239 int len, tlen, off; 240 register struct tcpcb *tp = NULL; 241 register int tiflags; 242 struct socket *so = NULL; 243 int todrop, acked, ourfinisacked, needoutput = 0; 244 /* int dropsocket = 0; */ 245 int iss = 0; 246 u_long tiwin; 247 int ret; 248 /* int ts_present = 0; */ 249 struct ex_list *ex_ptr; 250 251 DEBUG_CALL("tcp_input"); 252 DEBUG_ARGS((dfd," m = %8lx iphlen = %2d inso = %lx\n", 253 (long )m, iphlen, (long )inso )); 254 255 /* 256 * If called with m == 0, then we're continuing the connect 257 */ 258 if (m == NULL) { 259 so = inso; 260 261 /* Re-set a few variables */ 262 tp = sototcpcb(so); 263 m = so->so_m; 264 so->so_m = NULL; 265 ti = so->so_ti; 266 tiwin = ti->ti_win; 267 tiflags = ti->ti_flags; 268 269 goto cont_conn; 270 } 271 272 273 STAT(tcpstat.tcps_rcvtotal++); 274 /* 275 * Get IP and TCP header together in first mbuf. 276 * Note: IP leaves IP header in first mbuf. 277 */ 278 ti = mtod(m, struct tcpiphdr *); 279 if (iphlen > sizeof(struct ip )) { 280 ip_stripoptions(m, (struct mbuf *)0); 281 iphlen=sizeof(struct ip ); 282 } 283 /* XXX Check if too short */ 284 285 286 /* 287 * Save a copy of the IP header in case we want restore it 288 * for sending an ICMP error message in response. 289 */ 290 ip=mtod(m, struct ip *); 291 save_ip = *ip; 292 save_ip.ip_len+= iphlen; 293 294 /* 295 * Checksum extended TCP header and data. 296 */ 297 tlen = ((struct ip *)ti)->ip_len; 298 tcpiphdr2qlink(ti)->next = tcpiphdr2qlink(ti)->prev = NULL; 299 memset(&ti->ti_i.ih_mbuf, 0 , sizeof(struct mbuf_ptr)); 300 ti->ti_x1 = 0; 301 ti->ti_len = htons((u_int16_t)tlen); 302 len = sizeof(struct ip ) + tlen; 303 /* keep checksum for ICMP reply 304 * ti->ti_sum = cksum(m, len); 305 * if (ti->ti_sum) { */ 306 if(cksum(m, len)) { 307 STAT(tcpstat.tcps_rcvbadsum++); 308 goto drop; 309 } 310 311 /* 312 * Check that TCP offset makes sense, 313 * pull out TCP options and adjust length. XXX 314 */ 315 off = ti->ti_off << 2; 316 if (off < sizeof (struct tcphdr) || off > tlen) { 317 STAT(tcpstat.tcps_rcvbadoff++); 318 goto drop; 319 } 320 tlen -= off; 321 ti->ti_len = tlen; 322 if (off > sizeof (struct tcphdr)) { 323 optlen = off - sizeof (struct tcphdr); 324 optp = mtod(m, caddr_t) + sizeof (struct tcpiphdr); 325 326 /* 327 * Do quick retrieval of timestamp options ("options 328 * prediction?"). If timestamp is the only option and it's 329 * formatted as recommended in RFC 1323 appendix A, we 330 * quickly get the values now and not bother calling 331 * tcp_dooptions(), etc. 332 */ 333 /* if ((optlen == TCPOLEN_TSTAMP_APPA || 334 * (optlen > TCPOLEN_TSTAMP_APPA && 335 * optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) && 336 * *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) && 337 * (ti->ti_flags & TH_SYN) == 0) { 338 * ts_present = 1; 339 * ts_val = ntohl(*(u_int32_t *)(optp + 4)); 340 * ts_ecr = ntohl(*(u_int32_t *)(optp + 8)); 341 * optp = NULL; / * we've parsed the options * / 342 * } 343 */ 344 } 345 tiflags = ti->ti_flags; 346 347 /* 348 * Convert TCP protocol specific fields to host format. 349 */ 350 NTOHL(ti->ti_seq); 351 NTOHL(ti->ti_ack); 352 NTOHS(ti->ti_win); 353 NTOHS(ti->ti_urp); 354 355 /* 356 * Drop TCP, IP headers and TCP options. 357 */ 358 m->m_data += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); 359 m->m_len -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); 360 361 if (slirp_restrict) { 362 for (ex_ptr = exec_list; ex_ptr; ex_ptr = ex_ptr->ex_next) 363 if (ex_ptr->ex_fport == ti->ti_dport && 364 (ntohl(ti->ti_dst.s_addr) & 0xff) == ex_ptr->ex_addr) 365 break; 366 367 if (!ex_ptr) 368 goto drop; 369 } 370 /* 371 * Locate pcb for segment. 372 */ 373 findso: 374 so = tcp_last_so; 375 if (so->so_fport != ti->ti_dport || 376 so->so_lport != ti->ti_sport || 377 so->so_laddr.s_addr != ti->ti_src.s_addr || 378 so->so_faddr.s_addr != ti->ti_dst.s_addr) { 379 so = solookup(&tcb, ti->ti_src, ti->ti_sport, 380 ti->ti_dst, ti->ti_dport); 381 if (so) 382 tcp_last_so = so; 383 STAT(tcpstat.tcps_socachemiss++); 384 } 385 386 /* 387 * If the state is CLOSED (i.e., TCB does not exist) then 388 * all data in the incoming segment is discarded. 389 * If the TCB exists but is in CLOSED state, it is embryonic, 390 * but should either do a listen or a connect soon. 391 * 392 * state == CLOSED means we've done socreate() but haven't 393 * attached it to a protocol yet... 394 * 395 * XXX If a TCB does not exist, and the TH_SYN flag is 396 * the only flag set, then create a session, mark it 397 * as if it was LISTENING, and continue... 398 */ 399 if (so == NULL) { 400 if ((tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) != TH_SYN) 401 goto dropwithreset; 402 403 if ((so = socreate()) == NULL) 404 goto dropwithreset; 405 if (tcp_attach(so) < 0) { 406 free(so); /* Not sofree (if it failed, it's not insqued) */ 407 goto dropwithreset; 408 } 409 410 sbreserve(&so->so_snd, TCP_SNDSPACE); 411 sbreserve(&so->so_rcv, TCP_RCVSPACE); 412 413 /* tcp_last_so = so; */ /* XXX ? */ 414 /* tp = sototcpcb(so); */ 415 416 so->so_laddr = ti->ti_src; 417 so->so_lport = ti->ti_sport; 418 so->so_faddr = ti->ti_dst; 419 so->so_fport = ti->ti_dport; 420 421 if ((so->so_iptos = tcp_tos(so)) == 0) 422 so->so_iptos = ((struct ip *)ti)->ip_tos; 423 424 tp = sototcpcb(so); 425 tp->t_state = TCPS_LISTEN; 426 } 427 428 /* 429 * If this is a still-connecting socket, this probably 430 * a retransmit of the SYN. Whether it's a retransmit SYN 431 * or something else, we nuke it. 432 */ 433 if (so->so_state & SS_ISFCONNECTING) 434 goto drop; 435 436 tp = sototcpcb(so); 437 438 /* XXX Should never fail */ 439 if (tp == NULL) 440 goto dropwithreset; 441 if (tp->t_state == TCPS_CLOSED) 442 goto drop; 443 444 /* Unscale the window into a 32-bit value. */ 445 /* if ((tiflags & TH_SYN) == 0) 446 * tiwin = ti->ti_win << tp->snd_scale; 447 * else 448 */ 449 tiwin = ti->ti_win; 450 451 /* 452 * Segment received on connection. 453 * Reset idle time and keep-alive timer. 454 */ 455 tp->t_idle = 0; 456 if (SO_OPTIONS) 457 tp->t_timer[TCPT_KEEP] = TCPTV_KEEPINTVL; 458 else 459 tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_IDLE; 460 461 /* 462 * Process options if not in LISTEN state, 463 * else do it below (after getting remote address). 464 */ 465 if (optp && tp->t_state != TCPS_LISTEN) 466 tcp_dooptions(tp, (u_char *)optp, optlen, ti); 467 /* , */ 468 /* &ts_present, &ts_val, &ts_ecr); */ 469 470 /* 471 * Header prediction: check for the two common cases 472 * of a uni-directional data xfer. If the packet has 473 * no control flags, is in-sequence, the window didn't 474 * change and we're not retransmitting, it's a 475 * candidate. If the length is zero and the ack moved 476 * forward, we're the sender side of the xfer. Just 477 * free the data acked & wake any higher level process 478 * that was blocked waiting for space. If the length 479 * is non-zero and the ack didn't move, we're the 480 * receiver side. If we're getting packets in-order 481 * (the reassembly queue is empty), add the data to 482 * the socket buffer and note that we need a delayed ack. 483 * 484 * XXX Some of these tests are not needed 485 * eg: the tiwin == tp->snd_wnd prevents many more 486 * predictions.. with no *real* advantage.. 487 */ 488 if (tp->t_state == TCPS_ESTABLISHED && 489 (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && 490 /* (!ts_present || TSTMP_GEQ(ts_val, tp->ts_recent)) && */ 491 ti->ti_seq == tp->rcv_nxt && 492 tiwin && tiwin == tp->snd_wnd && 493 tp->snd_nxt == tp->snd_max) { 494 /* 495 * If last ACK falls within this segment's sequence numbers, 496 * record the timestamp. 497 */ 498 /* if (ts_present && SEQ_LEQ(ti->ti_seq, tp->last_ack_sent) && 499 * SEQ_LT(tp->last_ack_sent, ti->ti_seq + ti->ti_len)) { 500 * tp->ts_recent_age = tcp_now; 501 * tp->ts_recent = ts_val; 502 * } 503 */ 504 if (ti->ti_len == 0) { 505 if (SEQ_GT(ti->ti_ack, tp->snd_una) && 506 SEQ_LEQ(ti->ti_ack, tp->snd_max) && 507 tp->snd_cwnd >= tp->snd_wnd) { 508 /* 509 * this is a pure ack for outstanding data. 510 */ 511 STAT(tcpstat.tcps_predack++); 512 /* if (ts_present) 513 * tcp_xmit_timer(tp, tcp_now-ts_ecr+1); 514 * else 515 */ if (tp->t_rtt && 516 SEQ_GT(ti->ti_ack, tp->t_rtseq)) 517 tcp_xmit_timer(tp, tp->t_rtt); 518 acked = ti->ti_ack - tp->snd_una; 519 STAT(tcpstat.tcps_rcvackpack++); 520 STAT(tcpstat.tcps_rcvackbyte += acked); 521 sbdrop(&so->so_snd, acked); 522 tp->snd_una = ti->ti_ack; 523 m_freem(m); 524 525 /* 526 * If all outstanding data are acked, stop 527 * retransmit timer, otherwise restart timer 528 * using current (possibly backed-off) value. 529 * If process is waiting for space, 530 * wakeup/selwakeup/signal. If data 531 * are ready to send, let tcp_output 532 * decide between more output or persist. 533 */ 534 if (tp->snd_una == tp->snd_max) 535 tp->t_timer[TCPT_REXMT] = 0; 536 else if (tp->t_timer[TCPT_PERSIST] == 0) 537 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; 538 539 /* 540 * There's room in so_snd, sowwakup will read() 541 * from the socket if we can 542 */ 543 /* if (so->so_snd.sb_flags & SB_NOTIFY) 544 * sowwakeup(so); 545 */ 546 /* 547 * This is called because sowwakeup might have 548 * put data into so_snd. Since we don't so sowwakeup, 549 * we don't need this.. XXX??? 550 */ 551 if (so->so_snd.sb_cc) 552 (void) tcp_output(tp); 553 554 return; 555 } 556 } else if (ti->ti_ack == tp->snd_una && 557 tcpfrag_list_empty(tp) && 558 ti->ti_len <= sbspace(&so->so_rcv)) { 559 /* 560 * this is a pure, in-sequence data packet 561 * with nothing on the reassembly queue and 562 * we have enough buffer space to take it. 563 */ 564 STAT(tcpstat.tcps_preddat++); 565 tp->rcv_nxt += ti->ti_len; 566 STAT(tcpstat.tcps_rcvpack++); 567 STAT(tcpstat.tcps_rcvbyte += ti->ti_len); 568 /* 569 * Add data to socket buffer. 570 */ 571 if (so->so_emu) { 572 if (tcp_emu(so,m)) sbappend(so, m); 573 } else 574 sbappend(so, m); 575 576 /* 577 * XXX This is called when data arrives. Later, check 578 * if we can actually write() to the socket 579 * XXX Need to check? It's be NON_BLOCKING 580 */ 581 /* sorwakeup(so); */ 582 583 /* 584 * If this is a short packet, then ACK now - with Nagel 585 * congestion avoidance sender won't send more until 586 * he gets an ACK. 587 * 588 * It is better to not delay acks at all to maximize 589 * TCP throughput. See RFC 2581. 590 */ 591 tp->t_flags |= TF_ACKNOW; 592 tcp_output(tp); 593 return; 594 } 595 } /* header prediction */ 596 /* 597 * Calculate amount of space in receive window, 598 * and then do TCP input processing. 599 * Receive window is amount of space in rcv queue, 600 * but not less than advertised window. 601 */ 602 { int win; 603 win = sbspace(&so->so_rcv); 604 if (win < 0) 605 win = 0; 606 tp->rcv_wnd = max(win, (int)(tp->rcv_adv - tp->rcv_nxt)); 607 } 608 609 switch (tp->t_state) { 610 611 /* 612 * If the state is LISTEN then ignore segment if it contains an RST. 613 * If the segment contains an ACK then it is bad and send a RST. 614 * If it does not contain a SYN then it is not interesting; drop it. 615 * Don't bother responding if the destination was a broadcast. 616 * Otherwise initialize tp->rcv_nxt, and tp->irs, select an initial 617 * tp->iss, and send a segment: 618 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> 619 * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss. 620 * Fill in remote peer address fields if not previously specified. 621 * Enter SYN_RECEIVED state, and process any other fields of this 622 * segment in this state. 623 */ 624 case TCPS_LISTEN: { 625 626 if (tiflags & TH_RST) 627 goto drop; 628 if (tiflags & TH_ACK) 629 goto dropwithreset; 630 if ((tiflags & TH_SYN) == 0) 631 goto drop; 632 633 /* 634 * This has way too many gotos... 635 * But a bit of spaghetti code never hurt anybody :) 636 */ 637 638 /* 639 * If this is destined for the control address, then flag to 640 * tcp_ctl once connected, otherwise connect 641 */ 642 if ((so->so_faddr.s_addr&htonl(0xffffff00)) == special_addr.s_addr) { 643 int lastbyte=ntohl(so->so_faddr.s_addr) & 0xff; 644 if (lastbyte!=CTL_ALIAS && lastbyte!=CTL_DNS) { 645 #if 0 646 if(lastbyte==CTL_CMD || lastbyte==CTL_EXEC) { 647 /* Command or exec adress */ 648 so->so_state |= SS_CTL; 649 } else 650 #endif 651 { 652 /* May be an add exec */ 653 for(ex_ptr = exec_list; ex_ptr; ex_ptr = ex_ptr->ex_next) { 654 if(ex_ptr->ex_fport == so->so_fport && 655 lastbyte == ex_ptr->ex_addr) { 656 so->so_state |= SS_CTL; 657 break; 658 } 659 } 660 } 661 if(so->so_state & SS_CTL) goto cont_input; 662 } 663 /* CTL_ALIAS: Do nothing, tcp_fconnect will be called on it */ 664 } 665 666 if (so->so_emu & EMU_NOCONNECT) { 667 so->so_emu &= ~EMU_NOCONNECT; 668 goto cont_input; 669 } 670 671 if((tcp_fconnect(so) == -1) && (errno != EINPROGRESS) && (errno != EWOULDBLOCK)) { 672 u_char code=ICMP_UNREACH_NET; 673 DEBUG_MISC((dfd," tcp fconnect errno = %d-%s\n", 674 errno,strerror(errno))); 675 if(errno == ECONNREFUSED) { 676 /* ACK the SYN, send RST to refuse the connection */ 677 tcp_respond(tp, ti, m, ti->ti_seq+1, (tcp_seq)0, 678 TH_RST|TH_ACK); 679 } else { 680 if(errno == EHOSTUNREACH) code=ICMP_UNREACH_HOST; 681 HTONL(ti->ti_seq); /* restore tcp header */ 682 HTONL(ti->ti_ack); 683 HTONS(ti->ti_win); 684 HTONS(ti->ti_urp); 685 m->m_data -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); 686 m->m_len += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); 687 *ip=save_ip; 688 icmp_error(m, ICMP_UNREACH,code, 0,strerror(errno)); 689 } 690 tp = tcp_close(tp); 691 m_free(m); 692 } else { 693 /* 694 * Haven't connected yet, save the current mbuf 695 * and ti, and return 696 * XXX Some OS's don't tell us whether the connect() 697 * succeeded or not. So we must time it out. 698 */ 699 so->so_m = m; 700 so->so_ti = ti; 701 tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT; 702 tp->t_state = TCPS_SYN_RECEIVED; 703 } 704 return; 705 706 cont_conn: 707 /* m==NULL 708 * Check if the connect succeeded 709 */ 710 if (so->so_state & SS_NOFDREF) { 711 tp = tcp_close(tp); 712 goto dropwithreset; 713 } 714 cont_input: 715 tcp_template(tp); 716 717 if (optp) 718 tcp_dooptions(tp, (u_char *)optp, optlen, ti); 719 /* , */ 720 /* &ts_present, &ts_val, &ts_ecr); */ 721 722 if (iss) 723 tp->iss = iss; 724 else 725 tp->iss = tcp_iss; 726 tcp_iss += TCP_ISSINCR/2; 727 tp->irs = ti->ti_seq; 728 tcp_sendseqinit(tp); 729 tcp_rcvseqinit(tp); 730 tp->t_flags |= TF_ACKNOW; 731 tp->t_state = TCPS_SYN_RECEIVED; 732 tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT; 733 STAT(tcpstat.tcps_accepts++); 734 goto trimthenstep6; 735 } /* case TCPS_LISTEN */ 736 737 /* 738 * If the state is SYN_SENT: 739 * if seg contains an ACK, but not for our SYN, drop the input. 740 * if seg contains a RST, then drop the connection. 741 * if seg does not contain SYN, then drop it. 742 * Otherwise this is an acceptable SYN segment 743 * initialize tp->rcv_nxt and tp->irs 744 * if seg contains ack then advance tp->snd_una 745 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state 746 * arrange for segment to be acked (eventually) 747 * continue processing rest of data/controls, beginning with URG 748 */ 749 case TCPS_SYN_SENT: 750 if ((tiflags & TH_ACK) && 751 (SEQ_LEQ(ti->ti_ack, tp->iss) || 752 SEQ_GT(ti->ti_ack, tp->snd_max))) 753 goto dropwithreset; 754 755 if (tiflags & TH_RST) { 756 if (tiflags & TH_ACK) 757 tp = tcp_drop(tp,0); /* XXX Check t_softerror! */ 758 goto drop; 759 } 760 761 if ((tiflags & TH_SYN) == 0) 762 goto drop; 763 if (tiflags & TH_ACK) { 764 tp->snd_una = ti->ti_ack; 765 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 766 tp->snd_nxt = tp->snd_una; 767 } 768 769 tp->t_timer[TCPT_REXMT] = 0; 770 tp->irs = ti->ti_seq; 771 tcp_rcvseqinit(tp); 772 tp->t_flags |= TF_ACKNOW; 773 if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) { 774 STAT(tcpstat.tcps_connects++); 775 soisfconnected(so); 776 tp->t_state = TCPS_ESTABLISHED; 777 778 /* Do window scaling on this connection? */ 779 /* if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 780 * (TF_RCVD_SCALE|TF_REQ_SCALE)) { 781 * tp->snd_scale = tp->requested_s_scale; 782 * tp->rcv_scale = tp->request_r_scale; 783 * } 784 */ 785 (void) tcp_reass(tp, (struct tcpiphdr *)0, 786 (struct mbuf *)0); 787 /* 788 * if we didn't have to retransmit the SYN, 789 * use its rtt as our initial srtt & rtt var. 790 */ 791 if (tp->t_rtt) 792 tcp_xmit_timer(tp, tp->t_rtt); 793 } else 794 tp->t_state = TCPS_SYN_RECEIVED; 795 796 trimthenstep6: 797 /* 798 * Advance ti->ti_seq to correspond to first data byte. 799 * If data, trim to stay within window, 800 * dropping FIN if necessary. 801 */ 802 ti->ti_seq++; 803 if (ti->ti_len > tp->rcv_wnd) { 804 todrop = ti->ti_len - tp->rcv_wnd; 805 m_adj(m, -todrop); 806 ti->ti_len = tp->rcv_wnd; 807 tiflags &= ~TH_FIN; 808 STAT(tcpstat.tcps_rcvpackafterwin++); 809 STAT(tcpstat.tcps_rcvbyteafterwin += todrop); 810 } 811 tp->snd_wl1 = ti->ti_seq - 1; 812 tp->rcv_up = ti->ti_seq; 813 goto step6; 814 } /* switch tp->t_state */ 815 /* 816 * States other than LISTEN or SYN_SENT. 817 * First check timestamp, if present. 818 * Then check that at least some bytes of segment are within 819 * receive window. If segment begins before rcv_nxt, 820 * drop leading data (and SYN); if nothing left, just ack. 821 * 822 * RFC 1323 PAWS: If we have a timestamp reply on this segment 823 * and it's less than ts_recent, drop it. 824 */ 825 /* if (ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent && 826 * TSTMP_LT(ts_val, tp->ts_recent)) { 827 * 828 */ /* Check to see if ts_recent is over 24 days old. */ 829 /* if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) { 830 */ /* 831 * * Invalidate ts_recent. If this segment updates 832 * * ts_recent, the age will be reset later and ts_recent 833 * * will get a valid value. If it does not, setting 834 * * ts_recent to zero will at least satisfy the 835 * * requirement that zero be placed in the timestamp 836 * * echo reply when ts_recent isn't valid. The 837 * * age isn't reset until we get a valid ts_recent 838 * * because we don't want out-of-order segments to be 839 * * dropped when ts_recent is old. 840 * */ 841 /* tp->ts_recent = 0; 842 * } else { 843 * tcpstat.tcps_rcvduppack++; 844 * tcpstat.tcps_rcvdupbyte += ti->ti_len; 845 * tcpstat.tcps_pawsdrop++; 846 * goto dropafterack; 847 * } 848 * } 849 */ 850 851 todrop = tp->rcv_nxt - ti->ti_seq; 852 if (todrop > 0) { 853 if (tiflags & TH_SYN) { 854 tiflags &= ~TH_SYN; 855 ti->ti_seq++; 856 if (ti->ti_urp > 1) 857 ti->ti_urp--; 858 else 859 tiflags &= ~TH_URG; 860 todrop--; 861 } 862 /* 863 * Following if statement from Stevens, vol. 2, p. 960. 864 */ 865 if (todrop > ti->ti_len 866 || (todrop == ti->ti_len && (tiflags & TH_FIN) == 0)) { 867 /* 868 * Any valid FIN must be to the left of the window. 869 * At this point the FIN must be a duplicate or out 870 * of sequence; drop it. 871 */ 872 tiflags &= ~TH_FIN; 873 874 /* 875 * Send an ACK to resynchronize and drop any data. 876 * But keep on processing for RST or ACK. 877 */ 878 tp->t_flags |= TF_ACKNOW; 879 todrop = ti->ti_len; 880 STAT(tcpstat.tcps_rcvduppack++); 881 STAT(tcpstat.tcps_rcvdupbyte += todrop); 882 } else { 883 STAT(tcpstat.tcps_rcvpartduppack++); 884 STAT(tcpstat.tcps_rcvpartdupbyte += todrop); 885 } 886 m_adj(m, todrop); 887 ti->ti_seq += todrop; 888 ti->ti_len -= todrop; 889 if (ti->ti_urp > todrop) 890 ti->ti_urp -= todrop; 891 else { 892 tiflags &= ~TH_URG; 893 ti->ti_urp = 0; 894 } 895 } 896 /* 897 * If new data are received on a connection after the 898 * user processes are gone, then RST the other end. 899 */ 900 if ((so->so_state & SS_NOFDREF) && 901 tp->t_state > TCPS_CLOSE_WAIT && ti->ti_len) { 902 tp = tcp_close(tp); 903 STAT(tcpstat.tcps_rcvafterclose++); 904 goto dropwithreset; 905 } 906 907 /* 908 * If segment ends after window, drop trailing data 909 * (and PUSH and FIN); if nothing left, just ACK. 910 */ 911 todrop = (ti->ti_seq+ti->ti_len) - (tp->rcv_nxt+tp->rcv_wnd); 912 if (todrop > 0) { 913 STAT(tcpstat.tcps_rcvpackafterwin++); 914 if (todrop >= ti->ti_len) { 915 STAT(tcpstat.tcps_rcvbyteafterwin += ti->ti_len); 916 /* 917 * If a new connection request is received 918 * while in TIME_WAIT, drop the old connection 919 * and start over if the sequence numbers 920 * are above the previous ones. 921 */ 922 if (tiflags & TH_SYN && 923 tp->t_state == TCPS_TIME_WAIT && 924 SEQ_GT(ti->ti_seq, tp->rcv_nxt)) { 925 iss = tp->rcv_nxt + TCP_ISSINCR; 926 tp = tcp_close(tp); 927 goto findso; 928 } 929 /* 930 * If window is closed can only take segments at 931 * window edge, and have to drop data and PUSH from 932 * incoming segments. Continue processing, but 933 * remember to ack. Otherwise, drop segment 934 * and ack. 935 */ 936 if (tp->rcv_wnd == 0 && ti->ti_seq == tp->rcv_nxt) { 937 tp->t_flags |= TF_ACKNOW; 938 STAT(tcpstat.tcps_rcvwinprobe++); 939 } else 940 goto dropafterack; 941 } else 942 STAT(tcpstat.tcps_rcvbyteafterwin += todrop); 943 m_adj(m, -todrop); 944 ti->ti_len -= todrop; 945 tiflags &= ~(TH_PUSH|TH_FIN); 946 } 947 948 /* 949 * If last ACK falls within this segment's sequence numbers, 950 * record its timestamp. 951 */ 952 /* if (ts_present && SEQ_LEQ(ti->ti_seq, tp->last_ack_sent) && 953 * SEQ_LT(tp->last_ack_sent, ti->ti_seq + ti->ti_len + 954 * ((tiflags & (TH_SYN|TH_FIN)) != 0))) { 955 * tp->ts_recent_age = tcp_now; 956 * tp->ts_recent = ts_val; 957 * } 958 */ 959 960 /* 961 * If the RST bit is set examine the state: 962 * SYN_RECEIVED STATE: 963 * If passive open, return to LISTEN state. 964 * If active open, inform user that connection was refused. 965 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES: 966 * Inform user that connection was reset, and close tcb. 967 * CLOSING, LAST_ACK, TIME_WAIT STATES 968 * Close the tcb. 969 */ 970 if (tiflags&TH_RST) switch (tp->t_state) { 971 972 case TCPS_SYN_RECEIVED: 973 /* so->so_error = ECONNREFUSED; */ 974 goto close; 975 976 case TCPS_ESTABLISHED: 977 case TCPS_FIN_WAIT_1: 978 case TCPS_FIN_WAIT_2: 979 case TCPS_CLOSE_WAIT: 980 /* so->so_error = ECONNRESET; */ 981 close: 982 tp->t_state = TCPS_CLOSED; 983 STAT(tcpstat.tcps_drops++); 984 tp = tcp_close(tp); 985 goto drop; 986 987 case TCPS_CLOSING: 988 case TCPS_LAST_ACK: 989 case TCPS_TIME_WAIT: 990 tp = tcp_close(tp); 991 goto drop; 992 } 993 994 /* 995 * If a SYN is in the window, then this is an 996 * error and we send an RST and drop the connection. 997 */ 998 if (tiflags & TH_SYN) { 999 tp = tcp_drop(tp,0); 1000 goto dropwithreset; 1001 } 1002 1003 /* 1004 * If the ACK bit is off we drop the segment and return. 1005 */ 1006 if ((tiflags & TH_ACK) == 0) goto drop; 1007 1008 /* 1009 * Ack processing. 1010 */ 1011 switch (tp->t_state) { 1012 /* 1013 * In SYN_RECEIVED state if the ack ACKs our SYN then enter 1014 * ESTABLISHED state and continue processing, otherwise 1015 * send an RST. una<=ack<=max 1016 */ 1017 case TCPS_SYN_RECEIVED: 1018 1019 if (SEQ_GT(tp->snd_una, ti->ti_ack) || 1020 SEQ_GT(ti->ti_ack, tp->snd_max)) 1021 goto dropwithreset; 1022 STAT(tcpstat.tcps_connects++); 1023 tp->t_state = TCPS_ESTABLISHED; 1024 /* 1025 * The sent SYN is ack'ed with our sequence number +1 1026 * The first data byte already in the buffer will get 1027 * lost if no correction is made. This is only needed for 1028 * SS_CTL since the buffer is empty otherwise. 1029 * tp->snd_una++; or: 1030 */ 1031 tp->snd_una=ti->ti_ack; 1032 if (so->so_state & SS_CTL) { 1033 /* So tcp_ctl reports the right state */ 1034 ret = tcp_ctl(so); 1035 if (ret == 1) { 1036 soisfconnected(so); 1037 so->so_state &= ~SS_CTL; /* success XXX */ 1038 } else if (ret == 2) { 1039 so->so_state = SS_NOFDREF; /* CTL_CMD */ 1040 } else { 1041 needoutput = 1; 1042 tp->t_state = TCPS_FIN_WAIT_1; 1043 } 1044 } else { 1045 soisfconnected(so); 1046 } 1047 1048 /* Do window scaling? */ 1049 /* if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1050 * (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1051 * tp->snd_scale = tp->requested_s_scale; 1052 * tp->rcv_scale = tp->request_r_scale; 1053 * } 1054 */ 1055 (void) tcp_reass(tp, (struct tcpiphdr *)0, (struct mbuf *)0); 1056 tp->snd_wl1 = ti->ti_seq - 1; 1057 /* Avoid ack processing; snd_una==ti_ack => dup ack */ 1058 goto synrx_to_est; 1059 /* fall into ... */ 1060 1061 /* 1062 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range 1063 * ACKs. If the ack is in the range 1064 * tp->snd_una < ti->ti_ack <= tp->snd_max 1065 * then advance tp->snd_una to ti->ti_ack and drop 1066 * data from the retransmission queue. If this ACK reflects 1067 * more up to date window information we update our window information. 1068 */ 1069 case TCPS_ESTABLISHED: 1070 case TCPS_FIN_WAIT_1: 1071 case TCPS_FIN_WAIT_2: 1072 case TCPS_CLOSE_WAIT: 1073 case TCPS_CLOSING: 1074 case TCPS_LAST_ACK: 1075 case TCPS_TIME_WAIT: 1076 1077 if (SEQ_LEQ(ti->ti_ack, tp->snd_una)) { 1078 if (ti->ti_len == 0 && tiwin == tp->snd_wnd) { 1079 STAT(tcpstat.tcps_rcvdupack++); 1080 DEBUG_MISC((dfd," dup ack m = %lx so = %lx \n", 1081 (long )m, (long )so)); 1082 /* 1083 * If we have outstanding data (other than 1084 * a window probe), this is a completely 1085 * duplicate ack (ie, window info didn't 1086 * change), the ack is the biggest we've 1087 * seen and we've seen exactly our rexmt 1088 * threshold of them, assume a packet 1089 * has been dropped and retransmit it. 1090 * Kludge snd_nxt & the congestion 1091 * window so we send only this one 1092 * packet. 1093 * 1094 * We know we're losing at the current 1095 * window size so do congestion avoidance 1096 * (set ssthresh to half the current window 1097 * and pull our congestion window back to 1098 * the new ssthresh). 1099 * 1100 * Dup acks mean that packets have left the 1101 * network (they're now cached at the receiver) 1102 * so bump cwnd by the amount in the receiver 1103 * to keep a constant cwnd packets in the 1104 * network. 1105 */ 1106 if (tp->t_timer[TCPT_REXMT] == 0 || 1107 ti->ti_ack != tp->snd_una) 1108 tp->t_dupacks = 0; 1109 else if (++tp->t_dupacks == TCPREXMTTHRESH) { 1110 tcp_seq onxt = tp->snd_nxt; 1111 u_int win = 1112 min(tp->snd_wnd, tp->snd_cwnd) / 2 / 1113 tp->t_maxseg; 1114 1115 if (win < 2) 1116 win = 2; 1117 tp->snd_ssthresh = win * tp->t_maxseg; 1118 tp->t_timer[TCPT_REXMT] = 0; 1119 tp->t_rtt = 0; 1120 tp->snd_nxt = ti->ti_ack; 1121 tp->snd_cwnd = tp->t_maxseg; 1122 (void) tcp_output(tp); 1123 tp->snd_cwnd = tp->snd_ssthresh + 1124 tp->t_maxseg * tp->t_dupacks; 1125 if (SEQ_GT(onxt, tp->snd_nxt)) 1126 tp->snd_nxt = onxt; 1127 goto drop; 1128 } else if (tp->t_dupacks > TCPREXMTTHRESH) { 1129 tp->snd_cwnd += tp->t_maxseg; 1130 (void) tcp_output(tp); 1131 goto drop; 1132 } 1133 } else 1134 tp->t_dupacks = 0; 1135 break; 1136 } 1137 synrx_to_est: 1138 /* 1139 * If the congestion window was inflated to account 1140 * for the other side's cached packets, retract it. 1141 */ 1142 if (tp->t_dupacks > TCPREXMTTHRESH && 1143 tp->snd_cwnd > tp->snd_ssthresh) 1144 tp->snd_cwnd = tp->snd_ssthresh; 1145 tp->t_dupacks = 0; 1146 if (SEQ_GT(ti->ti_ack, tp->snd_max)) { 1147 STAT(tcpstat.tcps_rcvacktoomuch++); 1148 goto dropafterack; 1149 } 1150 acked = ti->ti_ack - tp->snd_una; 1151 STAT(tcpstat.tcps_rcvackpack++); 1152 STAT(tcpstat.tcps_rcvackbyte += acked); 1153 1154 /* 1155 * If we have a timestamp reply, update smoothed 1156 * round trip time. If no timestamp is present but 1157 * transmit timer is running and timed sequence 1158 * number was acked, update smoothed round trip time. 1159 * Since we now have an rtt measurement, cancel the 1160 * timer backoff (cf., Phil Karn's retransmit alg.). 1161 * Recompute the initial retransmit timer. 1162 */ 1163 /* if (ts_present) 1164 * tcp_xmit_timer(tp, tcp_now-ts_ecr+1); 1165 * else 1166 */ 1167 if (tp->t_rtt && SEQ_GT(ti->ti_ack, tp->t_rtseq)) 1168 tcp_xmit_timer(tp,tp->t_rtt); 1169 1170 /* 1171 * If all outstanding data is acked, stop retransmit 1172 * timer and remember to restart (more output or persist). 1173 * If there is more data to be acked, restart retransmit 1174 * timer, using current (possibly backed-off) value. 1175 */ 1176 if (ti->ti_ack == tp->snd_max) { 1177 tp->t_timer[TCPT_REXMT] = 0; 1178 needoutput = 1; 1179 } else if (tp->t_timer[TCPT_PERSIST] == 0) 1180 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; 1181 /* 1182 * When new data is acked, open the congestion window. 1183 * If the window gives us less than ssthresh packets 1184 * in flight, open exponentially (maxseg per packet). 1185 * Otherwise open linearly: maxseg per window 1186 * (maxseg^2 / cwnd per packet). 1187 */ 1188 { 1189 register u_int cw = tp->snd_cwnd; 1190 register u_int incr = tp->t_maxseg; 1191 1192 if (cw > tp->snd_ssthresh) 1193 incr = incr * incr / cw; 1194 tp->snd_cwnd = min(cw + incr, TCP_MAXWIN<<tp->snd_scale); 1195 } 1196 if (acked > so->so_snd.sb_cc) { 1197 tp->snd_wnd -= so->so_snd.sb_cc; 1198 sbdrop(&so->so_snd, (int )so->so_snd.sb_cc); 1199 ourfinisacked = 1; 1200 } else { 1201 sbdrop(&so->so_snd, acked); 1202 tp->snd_wnd -= acked; 1203 ourfinisacked = 0; 1204 } 1205 /* 1206 * XXX sowwakup is called when data is acked and there's room for 1207 * for more data... it should read() the socket 1208 */ 1209 /* if (so->so_snd.sb_flags & SB_NOTIFY) 1210 * sowwakeup(so); 1211 */ 1212 tp->snd_una = ti->ti_ack; 1213 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 1214 tp->snd_nxt = tp->snd_una; 1215 1216 switch (tp->t_state) { 1217 1218 /* 1219 * In FIN_WAIT_1 STATE in addition to the processing 1220 * for the ESTABLISHED state if our FIN is now acknowledged 1221 * then enter FIN_WAIT_2. 1222 */ 1223 case TCPS_FIN_WAIT_1: 1224 if (ourfinisacked) { 1225 /* 1226 * If we can't receive any more 1227 * data, then closing user can proceed. 1228 * Starting the timer is contrary to the 1229 * specification, but if we don't get a FIN 1230 * we'll hang forever. 1231 */ 1232 if (so->so_state & SS_FCANTRCVMORE) { 1233 soisfdisconnected(so); 1234 tp->t_timer[TCPT_2MSL] = TCP_MAXIDLE; 1235 } 1236 tp->t_state = TCPS_FIN_WAIT_2; 1237 } 1238 break; 1239 1240 /* 1241 * In CLOSING STATE in addition to the processing for 1242 * the ESTABLISHED state if the ACK acknowledges our FIN 1243 * then enter the TIME-WAIT state, otherwise ignore 1244 * the segment. 1245 */ 1246 case TCPS_CLOSING: 1247 if (ourfinisacked) { 1248 tp->t_state = TCPS_TIME_WAIT; 1249 tcp_canceltimers(tp); 1250 tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; 1251 soisfdisconnected(so); 1252 } 1253 break; 1254 1255 /* 1256 * In LAST_ACK, we may still be waiting for data to drain 1257 * and/or to be acked, as well as for the ack of our FIN. 1258 * If our FIN is now acknowledged, delete the TCB, 1259 * enter the closed state and return. 1260 */ 1261 case TCPS_LAST_ACK: 1262 if (ourfinisacked) { 1263 tp = tcp_close(tp); 1264 goto drop; 1265 } 1266 break; 1267 1268 /* 1269 * In TIME_WAIT state the only thing that should arrive 1270 * is a retransmission of the remote FIN. Acknowledge 1271 * it and restart the finack timer. 1272 */ 1273 case TCPS_TIME_WAIT: 1274 tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; 1275 goto dropafterack; 1276 } 1277 } /* switch(tp->t_state) */ 1278 1279 step6: 1280 /* 1281 * Update window information. 1282 * Don't look at window if no ACK: TAC's send garbage on first SYN. 1283 */ 1284 if ((tiflags & TH_ACK) && 1285 (SEQ_LT(tp->snd_wl1, ti->ti_seq) || 1286 (tp->snd_wl1 == ti->ti_seq && (SEQ_LT(tp->snd_wl2, ti->ti_ack) || 1287 (tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd))))) { 1288 /* keep track of pure window updates */ 1289 if (ti->ti_len == 0 && 1290 tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd) 1291 STAT(tcpstat.tcps_rcvwinupd++); 1292 tp->snd_wnd = tiwin; 1293 tp->snd_wl1 = ti->ti_seq; 1294 tp->snd_wl2 = ti->ti_ack; 1295 if (tp->snd_wnd > tp->max_sndwnd) 1296 tp->max_sndwnd = tp->snd_wnd; 1297 needoutput = 1; 1298 } 1299 1300 /* 1301 * Process segments with URG. 1302 */ 1303 if ((tiflags & TH_URG) && ti->ti_urp && 1304 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1305 /* 1306 * This is a kludge, but if we receive and accept 1307 * random urgent pointers, we'll crash in 1308 * soreceive. It's hard to imagine someone 1309 * actually wanting to send this much urgent data. 1310 */ 1311 if (ti->ti_urp + so->so_rcv.sb_cc > so->so_rcv.sb_datalen) { 1312 ti->ti_urp = 0; 1313 tiflags &= ~TH_URG; 1314 goto dodata; 1315 } 1316 /* 1317 * If this segment advances the known urgent pointer, 1318 * then mark the data stream. This should not happen 1319 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since 1320 * a FIN has been received from the remote side. 1321 * In these states we ignore the URG. 1322 * 1323 * According to RFC961 (Assigned Protocols), 1324 * the urgent pointer points to the last octet 1325 * of urgent data. We continue, however, 1326 * to consider it to indicate the first octet 1327 * of data past the urgent section as the original 1328 * spec states (in one of two places). 1329 */ 1330 if (SEQ_GT(ti->ti_seq+ti->ti_urp, tp->rcv_up)) { 1331 tp->rcv_up = ti->ti_seq + ti->ti_urp; 1332 so->so_urgc = so->so_rcv.sb_cc + 1333 (tp->rcv_up - tp->rcv_nxt); /* -1; */ 1334 tp->rcv_up = ti->ti_seq + ti->ti_urp; 1335 1336 } 1337 } else 1338 /* 1339 * If no out of band data is expected, 1340 * pull receive urgent pointer along 1341 * with the receive window. 1342 */ 1343 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 1344 tp->rcv_up = tp->rcv_nxt; 1345 dodata: 1346 1347 /* 1348 * Process the segment text, merging it into the TCP sequencing queue, 1349 * and arranging for acknowledgment of receipt if necessary. 1350 * This process logically involves adjusting tp->rcv_wnd as data 1351 * is presented to the user (this happens in tcp_usrreq.c, 1352 * case PRU_RCVD). If a FIN has already been received on this 1353 * connection then we just ignore the text. 1354 */ 1355 if ((ti->ti_len || (tiflags&TH_FIN)) && 1356 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1357 TCP_REASS(tp, ti, m, so, tiflags); 1358 /* 1359 * Note the amount of data that peer has sent into 1360 * our window, in order to estimate the sender's 1361 * buffer size. 1362 */ 1363 len = so->so_rcv.sb_datalen - (tp->rcv_adv - tp->rcv_nxt); 1364 } else { 1365 m_free(m); 1366 tiflags &= ~TH_FIN; 1367 } 1368 1369 /* 1370 * If FIN is received ACK the FIN and let the user know 1371 * that the connection is closing. 1372 */ 1373 if (tiflags & TH_FIN) { 1374 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1375 /* 1376 * If we receive a FIN we can't send more data, 1377 * set it SS_FDRAIN 1378 * Shutdown the socket if there is no rx data in the 1379 * buffer. 1380 * soread() is called on completion of shutdown() and 1381 * will got to TCPS_LAST_ACK, and use tcp_output() 1382 * to send the FIN. 1383 */ 1384 /* sofcantrcvmore(so); */ 1385 sofwdrain(so); 1386 1387 tp->t_flags |= TF_ACKNOW; 1388 tp->rcv_nxt++; 1389 } 1390 switch (tp->t_state) { 1391 1392 /* 1393 * In SYN_RECEIVED and ESTABLISHED STATES 1394 * enter the CLOSE_WAIT state. 1395 */ 1396 case TCPS_SYN_RECEIVED: 1397 case TCPS_ESTABLISHED: 1398 if(so->so_emu == EMU_CTL) /* no shutdown on socket */ 1399 tp->t_state = TCPS_LAST_ACK; 1400 else 1401 tp->t_state = TCPS_CLOSE_WAIT; 1402 break; 1403 1404 /* 1405 * If still in FIN_WAIT_1 STATE FIN has not been acked so 1406 * enter the CLOSING state. 1407 */ 1408 case TCPS_FIN_WAIT_1: 1409 tp->t_state = TCPS_CLOSING; 1410 break; 1411 1412 /* 1413 * In FIN_WAIT_2 state enter the TIME_WAIT state, 1414 * starting the time-wait timer, turning off the other 1415 * standard timers. 1416 */ 1417 case TCPS_FIN_WAIT_2: 1418 tp->t_state = TCPS_TIME_WAIT; 1419 tcp_canceltimers(tp); 1420 tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; 1421 soisfdisconnected(so); 1422 break; 1423 1424 /* 1425 * In TIME_WAIT state restart the 2 MSL time_wait timer. 1426 */ 1427 case TCPS_TIME_WAIT: 1428 tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; 1429 break; 1430 } 1431 } 1432 1433 /* 1434 * If this is a small packet, then ACK now - with Nagel 1435 * congestion avoidance sender won't send more until 1436 * he gets an ACK. 1437 * 1438 * See above. 1439 */ 1440 /* if (ti->ti_len && (unsigned)ti->ti_len < tp->t_maxseg) { 1441 */ 1442 /* if ((ti->ti_len && (unsigned)ti->ti_len < tp->t_maxseg && 1443 * (so->so_iptos & IPTOS_LOWDELAY) == 0) || 1444 * ((so->so_iptos & IPTOS_LOWDELAY) && 1445 * ((struct tcpiphdr_2 *)ti)->first_char == (char)27)) { 1446 */ 1447 if (ti->ti_len && (unsigned)ti->ti_len <= 5 && 1448 ((struct tcpiphdr_2 *)ti)->first_char == (char)27) { 1449 tp->t_flags |= TF_ACKNOW; 1450 } 1451 1452 /* 1453 * Return any desired output. 1454 */ 1455 if (needoutput || (tp->t_flags & TF_ACKNOW)) { 1456 (void) tcp_output(tp); 1457 } 1458 return; 1459 1460 dropafterack: 1461 /* 1462 * Generate an ACK dropping incoming segment if it occupies 1463 * sequence space, where the ACK reflects our state. 1464 */ 1465 if (tiflags & TH_RST) 1466 goto drop; 1467 m_freem(m); 1468 tp->t_flags |= TF_ACKNOW; 1469 (void) tcp_output(tp); 1470 return; 1471 1472 dropwithreset: 1473 /* reuses m if m!=NULL, m_free() unnecessary */ 1474 if (tiflags & TH_ACK) 1475 tcp_respond(tp, ti, m, (tcp_seq)0, ti->ti_ack, TH_RST); 1476 else { 1477 if (tiflags & TH_SYN) ti->ti_len++; 1478 tcp_respond(tp, ti, m, ti->ti_seq+ti->ti_len, (tcp_seq)0, 1479 TH_RST|TH_ACK); 1480 } 1481 1482 return; 1483 1484 drop: 1485 /* 1486 * Drop space held by incoming segment and return. 1487 */ 1488 m_free(m); 1489 1490 return; 1491 } 1492 1493 /* , ts_present, ts_val, ts_ecr) */ 1494 /* int *ts_present; 1495 * u_int32_t *ts_val, *ts_ecr; 1496 */ 1497 static void 1498 tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcpiphdr *ti) 1499 { 1500 u_int16_t mss; 1501 int opt, optlen; 1502 1503 DEBUG_CALL("tcp_dooptions"); 1504 DEBUG_ARGS((dfd," tp = %lx cnt=%i \n", (long )tp, cnt)); 1505 1506 for (; cnt > 0; cnt -= optlen, cp += optlen) { 1507 opt = cp[0]; 1508 if (opt == TCPOPT_EOL) 1509 break; 1510 if (opt == TCPOPT_NOP) 1511 optlen = 1; 1512 else { 1513 optlen = cp[1]; 1514 if (optlen <= 0) 1515 break; 1516 } 1517 switch (opt) { 1518 1519 default: 1520 continue; 1521 1522 case TCPOPT_MAXSEG: 1523 if (optlen != TCPOLEN_MAXSEG) 1524 continue; 1525 if (!(ti->ti_flags & TH_SYN)) 1526 continue; 1527 memcpy((char *) &mss, (char *) cp + 2, sizeof(mss)); 1528 NTOHS(mss); 1529 (void) tcp_mss(tp, mss); /* sets t_maxseg */ 1530 break; 1531 1532 /* case TCPOPT_WINDOW: 1533 * if (optlen != TCPOLEN_WINDOW) 1534 * continue; 1535 * if (!(ti->ti_flags & TH_SYN)) 1536 * continue; 1537 * tp->t_flags |= TF_RCVD_SCALE; 1538 * tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); 1539 * break; 1540 */ 1541 /* case TCPOPT_TIMESTAMP: 1542 * if (optlen != TCPOLEN_TIMESTAMP) 1543 * continue; 1544 * *ts_present = 1; 1545 * memcpy((char *) ts_val, (char *)cp + 2, sizeof(*ts_val)); 1546 * NTOHL(*ts_val); 1547 * memcpy((char *) ts_ecr, (char *)cp + 6, sizeof(*ts_ecr)); 1548 * NTOHL(*ts_ecr); 1549 * 1550 */ /* 1551 * * A timestamp received in a SYN makes 1552 * * it ok to send timestamp requests and replies. 1553 * */ 1554 /* if (ti->ti_flags & TH_SYN) { 1555 * tp->t_flags |= TF_RCVD_TSTMP; 1556 * tp->ts_recent = *ts_val; 1557 * tp->ts_recent_age = tcp_now; 1558 * } 1559 */ break; 1560 } 1561 } 1562 } 1563 1564 1565 /* 1566 * Pull out of band byte out of a segment so 1567 * it doesn't appear in the user's data queue. 1568 * It is still reflected in the segment length for 1569 * sequencing purposes. 1570 */ 1571 1572 #ifdef notdef 1573 1574 void 1575 tcp_pulloutofband(so, ti, m) 1576 struct socket *so; 1577 struct tcpiphdr *ti; 1578 register struct mbuf *m; 1579 { 1580 int cnt = ti->ti_urp - 1; 1581 1582 while (cnt >= 0) { 1583 if (m->m_len > cnt) { 1584 char *cp = mtod(m, caddr_t) + cnt; 1585 struct tcpcb *tp = sototcpcb(so); 1586 1587 tp->t_iobc = *cp; 1588 tp->t_oobflags |= TCPOOB_HAVEDATA; 1589 memcpy(sp, cp+1, (unsigned)(m->m_len - cnt - 1)); 1590 m->m_len--; 1591 return; 1592 } 1593 cnt -= m->m_len; 1594 m = m->m_next; /* XXX WRONG! Fix it! */ 1595 if (m == 0) 1596 break; 1597 } 1598 panic("tcp_pulloutofband"); 1599 } 1600 1601 #endif /* notdef */ 1602 1603 /* 1604 * Collect new round-trip time estimate 1605 * and update averages and current timeout. 1606 */ 1607 1608 static void 1609 tcp_xmit_timer(register struct tcpcb *tp, int rtt) 1610 { 1611 register short delta; 1612 1613 DEBUG_CALL("tcp_xmit_timer"); 1614 DEBUG_ARG("tp = %lx", (long)tp); 1615 DEBUG_ARG("rtt = %d", rtt); 1616 1617 STAT(tcpstat.tcps_rttupdated++); 1618 if (tp->t_srtt != 0) { 1619 /* 1620 * srtt is stored as fixed point with 3 bits after the 1621 * binary point (i.e., scaled by 8). The following magic 1622 * is equivalent to the smoothing algorithm in rfc793 with 1623 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 1624 * point). Adjust rtt to origin 0. 1625 */ 1626 delta = rtt - 1 - (tp->t_srtt >> TCP_RTT_SHIFT); 1627 if ((tp->t_srtt += delta) <= 0) 1628 tp->t_srtt = 1; 1629 /* 1630 * We accumulate a smoothed rtt variance (actually, a 1631 * smoothed mean difference), then set the retransmit 1632 * timer to smoothed rtt + 4 times the smoothed variance. 1633 * rttvar is stored as fixed point with 2 bits after the 1634 * binary point (scaled by 4). The following is 1635 * equivalent to rfc793 smoothing with an alpha of .75 1636 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 1637 * rfc793's wired-in beta. 1638 */ 1639 if (delta < 0) 1640 delta = -delta; 1641 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT); 1642 if ((tp->t_rttvar += delta) <= 0) 1643 tp->t_rttvar = 1; 1644 } else { 1645 /* 1646 * No rtt measurement yet - use the unsmoothed rtt. 1647 * Set the variance to half the rtt (so our first 1648 * retransmit happens at 3*rtt). 1649 */ 1650 tp->t_srtt = rtt << TCP_RTT_SHIFT; 1651 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); 1652 } 1653 tp->t_rtt = 0; 1654 tp->t_rxtshift = 0; 1655 1656 /* 1657 * the retransmit should happen at rtt + 4 * rttvar. 1658 * Because of the way we do the smoothing, srtt and rttvar 1659 * will each average +1/2 tick of bias. When we compute 1660 * the retransmit timer, we want 1/2 tick of rounding and 1661 * 1 extra tick because of +-1/2 tick uncertainty in the 1662 * firing of the timer. The bias will give us exactly the 1663 * 1.5 tick we need. But, because the bias is 1664 * statistical, we have to test that we don't drop below 1665 * the minimum feasible timer (which is 2 ticks). 1666 */ 1667 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), 1668 (short)tp->t_rttmin, TCPTV_REXMTMAX); /* XXX */ 1669 1670 /* 1671 * We received an ack for a packet that wasn't retransmitted; 1672 * it is probably safe to discard any error indications we've 1673 * received recently. This isn't quite right, but close enough 1674 * for now (a route might have failed after we sent a segment, 1675 * and the return path might not be symmetrical). 1676 */ 1677 tp->t_softerror = 0; 1678 } 1679 1680 /* 1681 * Determine a reasonable value for maxseg size. 1682 * If the route is known, check route for mtu. 1683 * If none, use an mss that can be handled on the outgoing 1684 * interface without forcing IP to fragment; if bigger than 1685 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES 1686 * to utilize large mbufs. If no route is found, route has no mtu, 1687 * or the destination isn't local, use a default, hopefully conservative 1688 * size (usually 512 or the default IP max size, but no more than the mtu 1689 * of the interface), as we can't discover anything about intervening 1690 * gateways or networks. We also initialize the congestion/slow start 1691 * window to be a single segment if the destination isn't local. 1692 * While looking at the routing entry, we also initialize other path-dependent 1693 * parameters from pre-set or cached values in the routing entry. 1694 */ 1695 1696 int 1697 tcp_mss(struct tcpcb *tp, u_int offer) 1698 { 1699 struct socket *so = tp->t_socket; 1700 int mss; 1701 1702 DEBUG_CALL("tcp_mss"); 1703 DEBUG_ARG("tp = %lx", (long)tp); 1704 DEBUG_ARG("offer = %d", offer); 1705 1706 mss = min(IF_MTU, IF_MRU) - sizeof(struct tcpiphdr); 1707 if (offer) 1708 mss = min(mss, offer); 1709 mss = max(mss, 32); 1710 if (mss < tp->t_maxseg || offer != 0) 1711 tp->t_maxseg = mss; 1712 1713 tp->snd_cwnd = mss; 1714 1715 sbreserve(&so->so_snd, TCP_SNDSPACE + ((TCP_SNDSPACE % mss) ? 1716 (mss - (TCP_SNDSPACE % mss)) : 1717 0)); 1718 sbreserve(&so->so_rcv, TCP_RCVSPACE + ((TCP_RCVSPACE % mss) ? 1719 (mss - (TCP_RCVSPACE % mss)) : 1720 0)); 1721 1722 DEBUG_MISC((dfd, " returning mss = %d\n", mss)); 1723 1724 return mss; 1725 } 1726