/* Port to OS X by Rob Braun (bbraun@synack.net)
 *
 * Copyright (c)  1996, 2001 Portland State University
 * All rights reserved.
 *   
 *  Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions
 *  are met:
 *  1. Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer as
 *     the first lines of this file unmodified.
 *  2. Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *  
 *  THIS SOFTWARE IS PROVIDED BY THE AUTHOR/S ``AS IS'' AND ANY EXPRESS OR
 *  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 *  OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 *  IN NO EVENT SHALL Portland State University or the authors BE
 *  LIABLE FOR ANY DIRECT, INDIRECT, 
 *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 *  NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 *  THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *            
 *  SACK and FACK implementation in FreeBSD Release-4.3.
 *
 *              Dr. Suresh Singh, Shiv Saxena and Harkirat Singh
 *              Portland State University
 *              Computer Science Dept. - Aug 16, 2001
 *
 *
 *              email: {singh, saxenas, harkirat}@cs.pdx.edu
 *              project page: http://www.cs.pdx.edu/~singh/pacman.html
 *
 *
 */
diff -udr 10.2.3/xnu/bsd/kern/sysctl_init.c xnu/bsd/kern/sysctl_init.c
--- 10.2.3/xnu/bsd/kern/sysctl_init.c	Thu Jan 30 23:52:27 2003
+++ xnu/bsd/kern/sysctl_init.c	Tue Feb 18 14:02:24 2003
@@ -121,6 +121,7 @@
 extern struct sysctl_oid sysctl__net_inet_tcp_slowstart_flightsize;
 extern struct sysctl_oid sysctl__net_inet_tcp_local_slowstart_flightsize;
 extern struct sysctl_oid sysctl__net_inet_tcp_newreno;
+extern struct sysctl_oid sysctl__net_inet_tcp_sack;
 extern struct sysctl_oid sysctl__net_inet_tcp_tcbhashsize;
 extern struct sysctl_oid sysctl__net_inet_tcp_do_tcpdrain;
 extern struct sysctl_oid sysctl__net_inet_tcp_icmp_may_rst;
@@ -455,6 +456,7 @@
     ,&sysctl__net_inet_tcp_delacktime
     ,&sysctl__net_inet_tcp_isn_reseed_interval
     ,&sysctl__net_inet_tcp_msl
+    ,&sysctl__net_inet_tcp_sack
 #if TCP_DROP_SYNFIN
     ,&sysctl__net_inet_tcp_drop_synfin
 #endif
diff -udr 10.2.3/xnu/bsd/netinet/tcp.h xnu/bsd/netinet/tcp.h
--- 10.2.3/xnu/bsd/netinet/tcp.h	Thu Jan 30 23:54:29 2003
+++ xnu/bsd/netinet/tcp.h	Mon Feb 17 20:48:46 2003
@@ -107,12 +107,19 @@
 #define TCPOPT_SACK_PERMITTED	4		/* Experimental */
 #define    TCPOLEN_SACK_PERMITTED	2
 #define TCPOPT_SACK		5		/* Experimental */
+#define TCPOLEN_SACK		8		/* len of sack block */
 #define TCPOPT_TIMESTAMP	8
 #define    TCPOLEN_TIMESTAMP		10
 #define    TCPOLEN_TSTAMP_APPA		(TCPOLEN_TIMESTAMP+2) /* appendix A */
 #define    TCPOPT_TSTAMP_HDR		\
     (TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)
 
+#define TCPOPT_SACK_PERMIT_HDR \
+ (TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_SACK_PERMITTED<<8|TCPOLEN_SACK_PERMITTED)
+#define TCPOPT_SACK_HDR		(TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_SACK<<8)
+#define MAX_SACK_BLKS 6       /* Max # SACK blocks stored at sender side */
+#define TCP_MAX_SACK  3       /* MAX # SACKs sent in any segment */
+
 #define	TCPOPT_CC		11		/* CC options: RFC-1644 */
 #define TCPOPT_CCNEW		12
 #define TCPOPT_CCECHO		13
@@ -155,5 +162,6 @@
 #define	TCP_MAXSEG	0x02	/* set maximum segment size */
 #define TCP_NOPUSH	0x04	/* don't push last block of write */
 #define TCP_NOOPT	0x08	/* don't use TCP options */
+#define TCP_SACK_DISABLE 0x300  /* disable SACKs (if enabled by def.) */
 
 #endif
diff -udr 10.2.3/xnu/bsd/netinet/tcp_input.c xnu/bsd/netinet/tcp_input.c
--- 10.2.3/xnu/bsd/netinet/tcp_input.c	Thu Jan 30 23:54:30 2003
+++ xnu/bsd/netinet/tcp_input.c	Tue Feb 18 15:35:29 2003
@@ -122,7 +122,7 @@
 #define DBG_FNC_TCP_INPUT       NETDBG_CODE(DBG_NETTCP, (3 << 8))
 #define DBG_FNC_TCP_NEWCONN     NETDBG_CODE(DBG_NETTCP, (7 << 8))
 
-static int	tcprexmtthresh = 3;
+int	tcprexmtthresh = 3;
 tcp_cc	tcp_ccgen;
 extern int apple_hwcksum_rx;
 
@@ -974,6 +974,9 @@
 	if (TCPS_HAVEESTABLISHED(tp->t_state))
 		tp->t_timer[TCPT_KEEP] = tcp_keepidle;
 
+	if (!tp->sack_disable)
+		tcp_del_sackholes(tp, th); /* Delete stale SACK holes */
+
 	/*
 	 * Process options if not in LISTEN state,
 	 * else do it below (after getting remote address).
@@ -981,6 +984,11 @@
 	if (tp->t_state != TCPS_LISTEN && optp)
 		tcp_dooptions(tp, optp, optlen, th, &to);
 
+	if (!tp->sack_disable) {
+		tp->rcv_laststart = th->th_seq;
+		tp->rcv_lastend = th->th_seq + tlen;
+	}
+
 	/*
 	 * Header prediction: check for the two common cases
 	 * of a uni-directional data xfer.  If the packet has
@@ -1057,6 +1065,9 @@
 				tcpstat.tcps_rcvackbyte += acked;
 				sbdrop(&so->so_snd, acked);
 				tp->snd_una = th->th_ack;
+				tp->snd_last = tp->snd_una;
+				tp->snd_fack = tp->snd_una;
+				tp->retran_data = 0;
 				m_freem(m);
 				ND6_HINT(tp); /* some progress has been done */
 
@@ -1088,6 +1099,8 @@
 			 * with nothing on the reassembly queue and
 			 * we have enough buffer space to take it.
 			 */
+			if (!tp->sack_disable && tp->rcv_numsacks )
+				tcp_clean_sackreport(tp);
 			++tcpstat.tcps_preddat;
 			tp->rcv_nxt += tlen;
 			tcpstat.tcps_rcvpack++;
@@ -1249,6 +1262,9 @@
 			bzero(taop, sizeof(*taop));
 		}
 		tcp_dooptions(tp, optp, optlen, th, &to);
+		if (!tp->sack_disable)
+			if ((tp->t_flags & TF_SACK_PERMIT) == 0)
+				tp->sack_disable = 1;
 		if (iss)
 			tp->iss = iss;
 		else {
@@ -1256,6 +1272,10 @@
  		}
 		tp->irs = th->th_seq;
 		tcp_sendseqinit(tp);
+		tp->snd_last = tp->snd_una;
+		tp->snd_fack = tp->snd_una;
+		tp->retran_data = 0;
+		tp->snd_awnd = 0;
 		tcp_rcvseqinit(tp);
 		tp->snd_recover = tp->snd_una;
 		/*
@@ -1428,6 +1448,9 @@
 				}
 			} else
 				tp->t_flags &= ~TF_RCVD_CC;
+			if (!tp->sack_disable)
+				if ((tp->t_flags & TF_SACK_PERMIT) == 0)
+					tp->sack_disable = 1;
 			tcpstat.tcps_connects++;
 			soisconnected(so);
 			/* Do window scaling on this connection? */
@@ -1941,25 +1964,30 @@
 				if (tp->t_timer[TCPT_REXMT] == 0 ||
 				    th->th_ack != tp->snd_una)
 					tp->t_dupacks = 0;
-				else if (++tp->t_dupacks == tcprexmtthresh) {
+				else if (++tp->t_dupacks == tcprexmtthresh ||
+					((SEQ_GT(tp->snd_fack, tcprexmtthresh *
+						tp->t_maxseg + tp->snd_una)) &&
+					SEQ_GT(tp->snd_una, tp->snd_last))) {
 					tcp_seq onxt = tp->snd_nxt;
 					u_int win =
 					    min(tp->snd_wnd, tp->snd_cwnd) / 2 /
 						tp->t_maxseg;
-					if (tcp_do_newreno && SEQ_LT(th->th_ack,
-					    tp->snd_recover)) {
-						/* False retransmit, should not
-						 * cut window
-						 */
-						tp->snd_cwnd += tp->t_maxseg;
+					if (SEQ_LT(th->th_ack, tp->snd_last)) {
 						tp->t_dupacks = 0;
-						(void) tcp_output(tp);
 						goto drop;
 					}
 					if (win < 2)
 						win = 2;
 					tp->snd_ssthresh = win * tp->t_maxseg;
-					tp->snd_recover = tp->snd_max;
+					tp->snd_last = tp->snd_max;
+					if (!tp->sack_disable) {
+						tp->t_timer[TCPT_REXMT] = 0;
+						tp->t_rtttime = 0;
+						tp->t_dupacks = tcprexmtthresh;
+						(void) tcp_output(tp);
+						tp->snd_cwnd = tp->snd_ssthresh;
+						goto drop;
+					}
 					tp->t_timer[TCPT_REXMT] = 0;
 					tp->t_rtttime = 0;
 					tp->snd_nxt = th->th_ack;
@@ -1971,6 +1999,10 @@
 						tp->snd_nxt = onxt;
 					goto drop;
 				} else if (tp->t_dupacks > tcprexmtthresh) {
+					if (!tp->sack_disable) {
+						if (tp->snd_awnd < tp->snd_cwnd)
+							tcp_output(tp);
+					}
 					tp->snd_cwnd += tp->t_maxseg;
 					(void) tcp_output(tp);
 					goto drop;
@@ -1983,27 +2015,33 @@
 		 * If the congestion window was inflated to account
 		 * for the other side's cached packets, retract it.
 		 */
-		if (tcp_do_newreno == 0) {
-                        if (tp->t_dupacks >= tcprexmtthresh &&
-                                tp->snd_cwnd > tp->snd_ssthresh)
-                                tp->snd_cwnd = tp->snd_ssthresh;
-                        tp->t_dupacks = 0;
-                } else if (tp->t_dupacks >= tcprexmtthresh &&
-		    !tcp_newreno(tp, th)) {
-                        /*
-                         * Window inflation should have left us with approx.
-                         * snd_ssthresh outstanding data.  But in case we
-                         * would be inclined to send a burst, better to do
-                         * it via the slow start mechanism.
-                         */
-			if (SEQ_GT(th->th_ack + tp->snd_ssthresh, tp->snd_max))
-                                tp->snd_cwnd =
-				    tp->snd_max - th->th_ack + tp->t_maxseg;
-			else
-                        	tp->snd_cwnd = tp->snd_ssthresh;
-                        tp->t_dupacks = 0;
-                }
-
+		if (!tp->sack_disable) {
+			if (tp->t_dupacks >= tcprexmtthresh) {
+				if (tcp_sack_partialack(tp, th)) {
+					if (tp->snd_awnd < tp->snd_cwnd)
+						needoutput = 1;
+				} else {
+					tp->snd_cwnd = tp->snd_ssthresh;
+					if (tcp_seq_subtract(tp->snd_max,
+						th->th_ack) < tp->snd_ssthresh)
+						tp->snd_cwnd = 
+							tcp_seq_subtract(tp->snd_max, th->th_ack);
+					tp->t_dupacks = 0;
+					if (SEQ_GT(th->th_ack, tp->snd_fack))
+						tp->snd_fack = th->th_ack;
+				}
+			}
+		} else {
+			if (tp->t_dupacks >= tcprexmtthresh &&
+					!tcp_newreno(tp, th)) {
+				tp->snd_cwnd = tp->snd_ssthresh;
+				if (tcp_seq_subtract(tp->snd_max, th->th_ack) <
+						tp->snd_ssthresh)
+					tp->snd_cwnd = tcp_seq_subtract(tp->snd_max,
+								th->th_ack);
+				tp->t_dupacks = 0;
+			}
+		}
 		if (tp->t_dupacks < tcprexmtthresh)
 			tp->t_dupacks = 0;
 
@@ -2104,7 +2142,7 @@
 		 * in NewReno fast recovery mode, so we leave the congestion
 		 * window alone.
 		 */
-		if (tcp_do_newreno == 0 || tp->t_dupacks == 0)
+		if (tp->t_dupacks < tcprexmtthresh)
 			tp->snd_cwnd = min(cw + incr,TCP_MAXWIN<<tp->snd_scale);
 		}
 		if (acked > so->so_snd.sb_cc) {
@@ -2120,6 +2158,11 @@
 		tp->snd_una = th->th_ack;
 		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
 			tp->snd_nxt = tp->snd_una;
+		if (SEQ_GT(tp->snd_una, tp->snd_fack)) {
+			tp->snd_fack = tp->snd_una;
+			tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt, 
+						tp->snd_fack) + tp->retran_data;
+		}
 
 		switch (tp->t_state) {
 
@@ -2343,6 +2386,10 @@
 			}
 				
 		}
+
+		if (!tp->sack_disable)
+			tcp_update_sack_list(tp);
+
 		/*
 		 * Note the amount of data that peer has sent into
 		 * our window, in order to estimate the sender's
@@ -2652,12 +2699,417 @@
 			    (char *)&to->to_ccecho, sizeof(to->to_ccecho));
 			NTOHL(to->to_ccecho);
 			break;
+		case TCPOPT_SACK_PERMITTED:
+			if (tp->sack_disable || optlen!=TCPOLEN_SACK_PERMITTED)
+				continue;
+			if (th->th_flags & TH_SYN)
+				tp->t_flags |= TF_SACK_PERMIT;
+			break;
+		case TCPOPT_SACK:
+			if (tcp_sack_option(tp, th, cp, optlen))
+				continue;
+			break;
 		}
 	}
 	if (th->th_flags & TH_SYN)
 		tcp_mss(tp, mss);	/* sets t_maxseg */
 }
 
+u_long
+tcp_seq_subtract(a, b)
+	u_long a, b;
+{
+	return ((long)(a - b));	/* a - b; unsigned math wraps on underflow */
+}
+
+/*
+ * This function is called upon receipt of new valid data (while not in header
+ * prediction mode), and it updates the ordered list of sacks. 
+ */
+void 
+tcp_update_sack_list(tp)
+	struct tcpcb *tp; 
+{    
+	/* 
+	 * First reported block MUST be the most recent one.  Subsequent
+	 * blocks SHOULD be in the order in which they arrived at the
+	 * receiver.  These two conditions make the implementation fully
+	 * compliant with RFC 2018.
+	 */     
+	int i, j = 0, count = 0, lastpos = -1;
+	struct sackblk sack, firstsack, temp[MAX_SACK_BLKS];
+    
+	/* First drop empty blocks and blocks ending at or below rcv_nxt */
+	for (i = 0; i < tp->rcv_numsacks; i++) {
+		sack = tp->sackblks[i];
+		if (sack.start == 0 && sack.end == 0) {
+			count++; /* count = number of blocks to be discarded */
+			continue;
+		}
+		if (SEQ_LEQ(sack.end, tp->rcv_nxt)) {
+			tp->sackblks[i].start = tp->sackblks[i].end = 0;
+			count++;
+		} else { 
+			temp[j].start = tp->sackblks[i].start;
+			temp[j++].end = tp->sackblks[i].end;
+		}
+	}   
+	tp->rcv_numsacks -= count;
+	if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */
+		tcp_clean_sackreport(tp);
+		if (SEQ_LT(tp->rcv_nxt, tp->rcv_laststart)) {
+			/* ==> need first sack block */
+			tp->sackblks[0].start = tp->rcv_laststart;
+			tp->sackblks[0].end = tp->rcv_lastend;
+			tp->rcv_numsacks = 1;
+		}
+		return;
+	}
+	/* Otherwise, sack blocks are already present. */
+	for (i = 0; i < tp->rcv_numsacks; i++)
+		tp->sackblks[i] = temp[i]; /* first copy back sack list */
+	if (SEQ_GEQ(tp->rcv_nxt, tp->rcv_lastend)) 
+		return;     /* last segment ends at/below rcv_nxt: no change */
+	/* 
+	 * From here, segment just received should be (part of) the 1st sack.
+	 * Go through list, possibly coalescing sack block entries.
+	 */
+	firstsack.start = tp->rcv_laststart;
+	firstsack.end = tp->rcv_lastend;
+	for (i = 0; i < tp->rcv_numsacks; i++) {
+		sack = tp->sackblks[i];
+		if (SEQ_LT(sack.end, firstsack.start) ||
+		    SEQ_GT(sack.start, firstsack.end))
+			continue; /* no overlap */
+		if (sack.start == firstsack.start && sack.end == firstsack.end){
+			/* 
+			 * identical block; delete it here since we will
+			 * move it to the front of the list.
+			 */
+			tp->sackblks[i].start = tp->sackblks[i].end = 0;
+			lastpos = i;    /* last posn with a zero entry */
+			continue;
+		}
+		if (SEQ_LEQ(sack.start, firstsack.start))
+			firstsack.start = sack.start; /* merge blocks */
+		if (SEQ_GEQ(sack.end, firstsack.end))
+			firstsack.end = sack.end;     /* merge blocks */
+		tp->sackblks[i].start = tp->sackblks[i].end = 0;
+		lastpos = i;    /* last posn with a zero entry */
+	}
+	if (lastpos != -1) {    /* at least one merge */
+		for (i = 0, j = 1; i < tp->rcv_numsacks; i++) {
+			sack = tp->sackblks[i];
+			if (sack.start == 0 && sack.end == 0)
+				continue;
+			temp[j++] = sack;
+		}
+		tp->rcv_numsacks = j; /* including first blk (added later) */
+		for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */
+			tp->sackblks[i] = temp[i];
+	} else {        /* no merges -- shift by 1; last is dropped if full */
+		if (tp->rcv_numsacks < MAX_SACK_BLKS)
+			tp->rcv_numsacks++;
+		for (i = tp->rcv_numsacks-1; i > 0; i--)
+			tp->sackblks[i] = tp->sackblks[i-1];
+	}
+	tp->sackblks[0] = firstsack;
+	return;
+}  
+
+/*
+ * Process the TCP SACK option.  Returns 1 if tcp_dooptions() should continue,
+ * and 0 otherwise, if the option was fine.  tp->snd_holes is an ordered list
+ * of holes (oldest to newest, in terms of the sequence space).  
+ */             
+int
+tcp_sack_option(tp, th, cp, optlen)
+	struct tcpcb *tp;
+	struct tcphdr *th;
+	u_char *cp;
+	int    optlen;
+{       
+	int tmp_olen;
+	u_char *tmp_cp;
+	struct sackhole *cur, *p, *temp;
+   
+	if (tp->sack_disable)
+		return 1;
+           
+	/* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */
+	if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
+		return 1;
+	tmp_cp = cp + 2;
+	tmp_olen = optlen - 2;
+	if (tp->snd_numholes < 0)
+		tp->snd_numholes = 0;
+	if (tp->t_maxseg == 0)
+		panic("tcp_sack_option"); /* Should never happen */
+	while (tmp_olen > 0) {
+		struct sackblk sack;
+            
+		bcopy((char *) tmp_cp, (char *) &(sack.start), sizeof(tcp_seq));
+		NTOHL(sack.start); 
+		bcopy((char *) tmp_cp + sizeof(tcp_seq),
+		    (char *) &(sack.end), sizeof(tcp_seq));
+		NTOHL(sack.end);
+		tmp_olen -= TCPOLEN_SACK;
+		tmp_cp += TCPOLEN_SACK;
+		if (SEQ_LEQ(sack.end, sack.start))
+			continue; /* bad SACK fields */
+		if (SEQ_LEQ(sack.end, tp->snd_una)) 
+			continue; /* old block */
+		/* Updates snd_fack.  */
+		if (SEQ_GEQ(sack.end, tp->snd_fack))
+			tp->snd_fack = sack.end;
+		if (SEQ_GT(th->th_ack, tp->snd_una)) {
+			if (SEQ_LT(sack.start, th->th_ack))
+				continue;
+		} else {
+			if (SEQ_LT(sack.start, tp->snd_una))
+				continue;
+		}
+		if (SEQ_GT(sack.end, tp->snd_max))
+			continue;
+		if (tp->snd_holes == 0) { /* first hole */
+			tp->snd_holes = (struct sackhole *)
+			    _MALLOC(sizeof(struct sackhole), M_PCB, M_NOWAIT);
+			if (tp->snd_holes == NULL) {
+				/* ENOBUFS, so ignore SACKed block for now*/
+				continue;  
+			}
+			cur = tp->snd_holes;
+			cur->start = th->th_ack;
+			cur->end = sack.start;
+			cur->rxmit = cur->start;
+			cur->next = 0;
+			tp->snd_numholes = 1;
+			tp->rcv_lastsack = sack.end;
+			/* 
+			 * dups is at least one.  If more data has been 
+			 * SACKed, it can be greater than one.
+			 */
+			cur->dups = min(tcprexmtthresh, 
+			    ((sack.end - cur->end)/tp->t_maxseg));
+			if (cur->dups < 1)
+				cur->dups = 1;
+			continue; /* with next sack block */
+		}
+		/* Go thru list of holes:  p = previous,  cur = current */
+		p = cur = tp->snd_holes;
+		while (cur) {
+			if (SEQ_LEQ(sack.end, cur->start)) 
+				/* SACKs data before the current hole */ 
+				break; /* no use going through more holes */
+			if (SEQ_GEQ(sack.start, cur->end)) {
+				/* SACKs data beyond the current hole */
+				cur->dups++;
+				if ( ((sack.end - cur->end)/tp->t_maxseg) >=
+					tcprexmtthresh)
+					cur->dups = tcprexmtthresh;
+				p = cur;
+				cur = cur->next;
+				continue;
+			}
+			if (SEQ_LEQ(sack.start, cur->start)) {
+				/* Data acks at least the beginning of hole */
+				if (SEQ_GT(sack.end, cur->rxmit))
+					tp->retran_data -= 
+				    	    tcp_seq_subtract(cur->rxmit, 
+					    cur->start);
+				else
+					tp->retran_data -=
+					    tcp_seq_subtract(sack.end, 
+					    cur->start);
+				if (SEQ_GEQ(sack.end,cur->end)){
+					/* Acks entire hole, so delete hole */
+					if (p != cur) {
+						p->next = cur->next;
+						FREE(cur, M_PCB);
+						cur = p->next;
+					} else {
+						cur=cur->next;
+						FREE(p, M_PCB);
+						p = cur;
+						tp->snd_holes = p;
+					}
+					tp->snd_numholes--;
+					continue;
+				}
+				/* otherwise, move start of hole forward */
+				cur->start = sack.end;
+				cur->rxmit = max (cur->rxmit, cur->start);
+				p = cur;
+				cur = cur->next;
+				continue;
+			}
+			/* move end of hole backward */
+			if (SEQ_GEQ(sack.end, cur->end)) {
+				if (SEQ_GT(cur->rxmit, sack.start)) 
+					tp->retran_data -= 
+					    tcp_seq_subtract(cur->rxmit, 
+					    sack.start);
+				cur->end = sack.start;
+				cur->rxmit = min (cur->rxmit, cur->end);
+				cur->dups++;
+				if ( ((sack.end - cur->end)/tp->t_maxseg) >=
+					tcprexmtthresh)
+					cur->dups = tcprexmtthresh;
+				p = cur;
+				cur = cur->next;
+				continue;
+			}
+			if (SEQ_LT(cur->start, sack.start) &&
+			    SEQ_GT(cur->end, sack.end)) {
+				/*
+				 * ACKs data in the middle of a hole: split it.
+				 * On ENOBUFS bail out: retrying would spin.
+				 */
+				temp = (struct sackhole *)_MALLOC(sizeof(*temp),
+				    M_PCB,M_NOWAIT);
+				if (temp == NULL) 
+					break; /* ENOBUFS */
+				if (SEQ_GT(cur->rxmit, sack.end)) 
+					tp->retran_data -= 
+					    tcp_seq_subtract(sack.end, 
+					    sack.start);
+				else if (SEQ_GT(cur->rxmit, sack.start))
+					tp->retran_data -= 
+					    tcp_seq_subtract(cur->rxmit, 
+					    sack.start);
+				temp->next = cur->next;
+				temp->start = sack.end;
+				temp->end = cur->end;
+				temp->dups = cur->dups;
+				temp->rxmit = max (cur->rxmit, temp->start);
+				cur->end = sack.start;
+				cur->rxmit = min (cur->rxmit, cur->end);
+				cur->dups++;
+				if ( ((sack.end - cur->end)/tp->t_maxseg) >=
+					tcprexmtthresh)
+					cur->dups = tcprexmtthresh;
+				cur->next = temp;
+				p = temp;
+				cur = p->next;
+				tp->snd_numholes++;
+			}
+		}
+		/* p is the last hole, unless the scan stopped early */
+		if (SEQ_LT(tp->rcv_lastsack, sack.start)) {
+			/*
+			 * Need to append new hole at end.
+			 * Last hole is p (and it's not NULL).
+			 */
+			temp = (struct sackhole *) _MALLOC(sizeof(*temp),
+			    M_PCB, M_NOWAIT);
+			if (temp == NULL) 
+				continue; /* ENOBUFS */
+			temp->start = tp->rcv_lastsack;
+			temp->end = sack.start;
+			temp->dups = min(tcprexmtthresh, 
+			    ((sack.end - sack.start)/tp->t_maxseg));
+			if (temp->dups < 1)
+				temp->dups = 1;
+			temp->rxmit = temp->start;
+			temp->next = 0;
+			p->next = temp;
+			tp->rcv_lastsack = sack.end;
+			tp->snd_numholes++;
+		}
+	}
+	/* 
+	 * Update retran_data and snd_awnd.  Go through the list of 
+	 * holes.   Increment retran_data by (hole->rxmit - hole->start).
+	 */
+	tp->retran_data = 0;
+	cur = tp->snd_holes;
+	while (cur) {
+		tp->retran_data += cur->rxmit - cur->start;
+		cur = cur->next;
+	}
+	tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt, tp->snd_fack) + 
+	    tp->retran_data;
+
+	return 0;
+}   
+
+/*
+ * Delete stale (i.e, cumulatively ack'd) holes.  Hole is deleted only if
+ * it is completely acked; otherwise, tcp_sack_option(), called from 
+ * tcp_dooptions(), will fix up the hole.
+ */
+void
+tcp_del_sackholes(tp, th)
+	struct tcpcb *tp;
+	struct tcphdr *th;
+{
+	if (!tp->sack_disable && tp->t_state != TCPS_LISTEN) {
+		/* max because this could be an older ack just arrived */
+		tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ?
+			th->th_ack : tp->snd_una;
+		struct sackhole *cur = tp->snd_holes;
+		struct sackhole *prev = cur;
+		while (cur)
+			if (SEQ_LEQ(cur->end, lastack)) { /* fully acked */
+				cur = cur->next;
+				FREE(prev, M_PCB);
+				prev = cur;
+				tp->snd_numholes--;
+			} else if (SEQ_LT(cur->start, lastack)) {
+				cur->start = lastack;	/* drop acked part */
+				if (SEQ_LT(cur->rxmit, cur->start))
+					cur->rxmit = cur->start;
+				break;
+			} else
+				break;
+		tp->snd_holes = cur;	/* new list head, may be NULL */
+	}
+}
+
+/* 
+ * Delete all receiver-side SACK information.
+ */
+void
+tcp_clean_sackreport(tp)
+	struct tcpcb *tp;
+{
+	int i;
+
+	tp->rcv_numsacks = 0;	/* no blocks left to report */
+	for (i = 0; i < MAX_SACK_BLKS; i++)	/* zero every slot */
+		tp->sackblks[i].start = tp->sackblks[i].end=0;
+
+}
+
+/* 
+ * Checks for partial ack.  If partial ack arrives, turn off retransmission
+ * timer, deflate the window, do not clear tp->t_dupacks, and return 1.
+ * If the ack advances at least to tp->snd_last, return 0.
+ */
+int
+tcp_sack_partialack(tp, th)
+	struct tcpcb *tp;
+	struct tcphdr *th;
+{
+	if (SEQ_LT(th->th_ack, tp->snd_last)) {
+		/* Turn off retx. timer (will start again next segment) */
+              tp->t_timer[TCPT_REXMT] = 0;
+	      tp->t_rtttime = 0;
+		/* 
+		 * Partial window deflation.  This statement relies on the 
+		 * fact that tp->snd_una has not been updated yet.  In FACK
+		 * hold snd_cwnd constant during fast recovery.
+		 */
+		if (tp->snd_cwnd > (th->th_ack - tp->snd_una)) {
+			tp->snd_cwnd -= th->th_ack - tp->snd_una;
+			tp->snd_cwnd += tp->t_maxseg;
+		} else
+			tp->snd_cwnd = tp->t_maxseg;
+		return 1;	/* partial ack: ack is below snd_last */
+	}
+	return 0;	/* ack reached snd_last */
+}
+
 /*
  * Pull out of band byte out of a segment so
  * it doesn't appear in the user's data queue.
@@ -3063,7 +3515,7 @@
 	struct tcpcb *tp;
 	struct tcphdr *th;
 {
-	if (SEQ_LT(th->th_ack, tp->snd_recover)) {
+	if (SEQ_LT(th->th_ack, tp->snd_last)) {
 		tcp_seq onxt = tp->snd_nxt;
 		u_long  ocwnd = tp->snd_cwnd;
 #ifdef __APPLE__
diff -udr 10.2.3/xnu/bsd/netinet/tcp_output.c xnu/bsd/netinet/tcp_output.c
--- 10.2.3/xnu/bsd/netinet/tcp_output.c	Thu Jan 30 23:54:30 2003
+++ xnu/bsd/netinet/tcp_output.c	Tue Feb 18 15:36:50 2003
@@ -129,6 +129,95 @@
 extern int ipsec_bypass;
 #endif
 
+extern int tcprexmtthresh;
+
+#ifdef TCP_SACK_DEBUG
+void
+tcp_print_holes(tp)
+struct tcpcb *tp;
+{
+	struct sackhole *p = tp->snd_holes;	/* head of the hole list */
+	if (p == 0)	/* no holes: print nothing */
+		return;
+	printf("Hole report: start--end dups rxmit\n");
+	while (p) {	/* NOTE(review): assumes 32-bit unsigned tcp_seq */
+		printf("%u--%u d %d r %u\n",  p->start, p->end, p->dups,
+                    p->rxmit);
+		p = p->next;
+	}
+	printf("\n");
+}
+#endif /* TCP_SACK_DEBUG */
+
+/*
+ * Returns pointer to a sackhole if there are any pending retransmissions;
+ * NULL otherwise.
+ */
+struct sackhole *
+tcp_sack_output(tp)
+register struct tcpcb *tp;
+{
+	struct sackhole *p;
+	if (tp->sack_disable)
+		return 0;
+	p = tp->snd_holes;
+	while (p) {
+		/* In FACK, if p->dups is less than tcprexmtthresh, but
+		 * snd_fack advances more than tcprexmtthresh * tp->t_maxseg,
+		 * tcp_input() will try fast retransmit. This forces output.
+		 */
+		if ((p->dups >= tcprexmtthresh ||
+		     tp->t_dupacks == tcprexmtthresh) &&
+		    SEQ_LT(p->rxmit, p->end)) {	/* rxmit still below end */
+			if (SEQ_LT(p->rxmit, tp->snd_una)) {/* old SACK hole */
+				p = p->next;
+				continue;
+			}
+#ifdef TCP_SACK_DEBUG
+			if (p)
+				tcp_print_holes(tp);
+#endif
+			return p;
+		}
+        	p = p->next;
+	}
+	return 0;	/* no hole qualifies for retransmission */
+}
+
+/*
+ * After a timeout, the SACK list may be rebuilt.  This SACK information
+ * should be used to avoid retransmitting SACKed data.  This function
+ * traverses the SACK list to see if snd_nxt should be moved forward.
+ */
+void
+tcp_sack_adjust(tp)
+	struct tcpcb *tp;
+{
+	struct sackhole *cur = tp->snd_holes;
+	if (cur == 0)
+		return; /* No holes */
+	if (SEQ_GEQ(tp->snd_nxt, tp->rcv_lastsack))
+		return; /* We're already beyond any SACKed blocks */
+	/* 
+	 * Two cases for which we want to advance snd_nxt:  
+	 * i) snd_nxt lies between end of one hole and beginning of another
+	 * ii) snd_nxt lies between end of last hole and rcv_lastsack
+	 */
+	while (cur->next) {
+		if (SEQ_LT(tp->snd_nxt, cur->end))
+			return;	/* below this hole's end: leave snd_nxt */
+		if (SEQ_GEQ(tp->snd_nxt, cur->next->start)) 
+			cur = cur->next;
+		else {
+			tp->snd_nxt = cur->next->start;	/* case i */
+			return;
+		}
+	}
+	if (SEQ_LT(tp->snd_nxt, cur->end))
+		return;	/* below the last hole's end: leave snd_nxt */
+	tp->snd_nxt = tp->rcv_lastsack;	/* case ii */
+	return;
+}
 /*
  * Tcp output routine: figure out what should be sent and send it.
  */
@@ -159,6 +248,8 @@
 	int    m_off;
 	struct mbuf *m_last = 0;
 	struct mbuf *m_head = 0;
+	int i, sack_rxmit = 0;
+	struct sackhole *p;
 
 
 	KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_START, 0,0,0,0,0);
@@ -217,8 +308,13 @@
 	}
 again:
 	sendalot = 0;
+	if (!tp->sack_disable && SEQ_LT(tp->snd_nxt, tp->snd_max))
+		tcp_sack_adjust(tp);
 	off = tp->snd_nxt - tp->snd_una;
-	win = min(tp->snd_wnd, tp->snd_cwnd);
+	if (!tp->sack_disable && (tp->t_dupacks > tcprexmtthresh))
+		win = tp->snd_wnd;
+	else
+		win = min(tp->snd_wnd, tp->snd_cwnd);
 
 	flags = tcp_outflags[tp->t_state];
 	/*
@@ -230,6 +326,16 @@
 	if (tp->t_flags & TF_NEEDSYN)
 		flags |= TH_SYN;
 
+	if (!tp->sack_disable && !sendalot) {
+		if (tp->t_dupacks >= tcprexmtthresh &&
+				(p = tcp_sack_output(tp))) {
+			off = p->rxmit - tp->snd_una;
+			sack_rxmit = 1;
+			len = min(tp->t_maxseg, p->end - p->rxmit);
+		}
+	}
+	sendalot = 0;
+
 	/*
 	 * If in persist timeout with window of 0, send 1 byte.
 	 * Otherwise, if window is small but nonzero
@@ -263,7 +369,12 @@
 		}
 	}
 
-	len = (long)ulmin(so->so_snd.sb_cc, win) - off;
+	if (!sack_rxmit) {
+
+		len = (long)ulmin(so->so_snd.sb_cc, win) - off;
+		if (!tp->sack_disable && len && SEQ_GT(tp->snd_last, tp->snd_una) && (tp->snd_awnd >= tp->snd_cwnd))
+			len = 0;
+	}
 
 	if ((taop = tcp_gettaocache(tp->t_inpcb)) == NULL) {
 		taop = &tao_noncached;
@@ -351,6 +462,8 @@
 			goto send;
 		if (SEQ_LT(tp->snd_nxt, tp->snd_max))
 			goto send;
+		if (sack_rxmit)
+			goto send;
 	}
 
 	/*
@@ -394,6 +507,11 @@
 	    ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
 		goto send;
 
+	if (SEQ_GT(tp->snd_max, tp->snd_una) && !tp->t_timer[TCPT_REXMT] &&
+			!tp->t_timer[TCPT_PERSIST]) {
+		tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
+	}
+
 	/*
 	 * TCP window updates are not reliable, rather a polling protocol
 	 * using ``persist'' packets is used to insure receipt of window
@@ -455,6 +573,13 @@
 			(void)memcpy(opt + 2, &mss, sizeof(mss));
 			optlen = TCPOLEN_MAXSEG;
 
+			if (!tp->sack_disable && ((flags & TH_ACK) == 0 ||
+					(tp->t_flags & TF_SACK_PERMIT))) {
+				*((u_int32_t *) (opt + optlen)) = 
+				htonl(TCPOPT_SACK_PERMIT_HDR);	
+				optlen += 4;
+			}
+
 			if ((tp->t_flags & TF_REQ_SCALE) &&
 			    ((flags & TH_ACK) == 0 ||
 			    (tp->t_flags & TF_RCVD_SCALE))) {
@@ -486,6 +611,28 @@
  		optlen += TCPOLEN_TSTAMP_APPA;
  	}
 
+	if (!tp->sack_disable && tp->t_state == TCPS_ESTABLISHED &&
+		(tp->t_flags & (TF_SACK_PERMIT|TF_NOOPT)) == TF_SACK_PERMIT &&
+		tp->rcv_numsacks) {
+
+		u_int32_t *lp = (u_int32_t *)(opt + optlen);
+		u_int32_t *olp = lp++;
+		int count = 0;  /* actual number of SACKs inserted */
+		int maxsack = (TCP_MAXOLEN - (optlen + 4))/TCPOLEN_SACK;
+
+		maxsack = min(maxsack, TCP_MAX_SACK);
+		for (i = 0; (i < tp->rcv_numsacks && count < maxsack); i++) {
+			struct sackblk sack = tp->sackblks[i];
+			if (sack.start == 0 && sack.end == 0)
+				continue;
+			*lp++ = htonl(sack.start);
+			*lp++ = htonl(sack.end);
+			count++;
+		}
+		*olp = htonl(TCPOPT_SACK_HDR|(TCPOLEN_SACK*count+2));
+		optlen += TCPOLEN_SACK*count + 4; /* including leading NOPs */
+	}
+
  	/*
 	 * Send `CC-family' options if our side wants to use them (TF_REQ_CC),
 	 * options are allowed (!TF_NOOPT) and it's not a RST.
@@ -781,6 +928,13 @@
 		th->th_seq = htonl(tp->snd_nxt);
 	else
 		th->th_seq = htonl(tp->snd_max);
+	if (sack_rxmit) {
+		if (sendalot)
+			sendalot = 0;
+		th->th_seq = htonl(p->rxmit);
+		p->rxmit += len;
+		tp->retran_data += len;
+	}
 	th->th_ack = htonl(tp->rcv_nxt);
 	if (optlen) {
 		bcopy(opt, th + 1, optlen);
@@ -855,6 +1009,11 @@
 				tp->t_flags |= TF_SENTFIN;
 			}
 		}
+		if (!tp->sack_disable) {
+			if (sack_rxmit && (p->rxmit != tp->snd_nxt)) {
+				goto timer;
+			}
+		}
 		tp->snd_nxt += len;
 		if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
 			tp->snd_max = tp->snd_nxt;
@@ -877,6 +1036,17 @@
 		 * Initialize shift counter which is used for backoff
 		 * of retransmit time.
 		 */
+timer:
+		if (!tp->sack_disable && sack_rxmit &&
+		    tp->t_timer[TCPT_REXMT] == 0 &&
+		    tp->snd_nxt != tp->snd_una) {
+			tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
+			if (tp->t_timer[TCPT_PERSIST]) {
+				tp->t_timer[TCPT_PERSIST] = 0;
+				tp->t_rxtshift = 0;
+			}
+		}
+		/* This seems to duplicate the above...  --bbraun */
 		if (tp->t_timer[TCPT_REXMT] == 0 &&
 		    tp->snd_nxt != tp->snd_una) {
 			tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
@@ -987,6 +1157,8 @@
 	error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route,
 	    (so->so_options & SO_DONTROUTE), 0);
     }
+	tp->snd_awnd = tcp_seq_subtract(tp->snd_max, tp->snd_fack) +
+		tp->retran_data;
 	if (error) {
 
 		/*
diff -udr 10.2.3/xnu/bsd/netinet/tcp_subr.c xnu/bsd/netinet/tcp_subr.c
--- 10.2.3/xnu/bsd/netinet/tcp_subr.c	Thu Jan 30 23:54:30 2003
+++ xnu/bsd/netinet/tcp_subr.c	Tue Feb 18 13:37:51 2003
@@ -168,6 +168,10 @@
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW,
     &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret");
 
+static int	tcp_do_sack = 1;
+SYSCTL_INT(_net_inet_tcp, TCPCTL_SACK, sack, CTLFLAG_RW, &tcp_do_sack, 0,
+	"Experimental Sack");
+
 static void	tcp_cleartaocache __P((void));
 static void	tcp_notify __P((struct inpcb *, int));
 
@@ -608,6 +612,8 @@
 	callout_init(tp->tt_delack = &it->inp_tp_delack);
 #endif
 
+	tp->sack_disable = tcp_do_sack ? 0 : 1;
+
 	if (tcp_do_rfc1323)
 		tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
 	if (tcp_do_rfc1644)
@@ -683,6 +689,7 @@
 	register struct tseg_qent *q;
 	struct inpcb *inp = tp->t_inpcb;
 	struct socket *so = inp->inp_socket;
+	struct sackhole *p, *q_sack;
 #if INET6
 	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
 #endif /* INET6 */
@@ -831,6 +838,13 @@
     no_valid_rt:
 	/* free the reassembly queue, if any */
 	(void) tcp_freeq(tp);
+
+	q_sack = p = tp->snd_holes;
+	while (p != 0) {
+		q_sack = p->next;
+		FREE(p, M_PCB);
+		p = q_sack;
+	}
 
 #ifdef __APPLE__
 	if (so->cached_in_sock_layer)
diff -udr 10.2.3/xnu/bsd/netinet/tcp_timer.c xnu/bsd/netinet/tcp_timer.c
--- 10.2.3/xnu/bsd/netinet/tcp_timer.c	Thu Jan 30 23:54:31 2003
+++ xnu/bsd/netinet/tcp_timer.c	Tue Feb 18 15:37:31 2003
@@ -59,6 +59,7 @@
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
+#include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/sysctl.h>
 #include <sys/socket.h>
@@ -249,6 +250,7 @@
 		splx(s);
 		return;
 	}
+
 	/*
 	 * Search through tcb's and update active timers.
 	 */
@@ -356,6 +358,7 @@
 	register int rexmt;
 	struct socket *so_tmp;
 	struct tcptemp *t_template;
+	struct sackhole *p, *q;
 
 #if INET6
 	int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV4) == 0;
@@ -371,6 +374,16 @@
 	 * control block.  Otherwise, check again in a bit.
 	 */
 	case TCPT_2MSL:
+		q = p = tp->snd_holes;
+		while (p != 0) {
+			q = p->next;
+			FREE(p, M_PCB);
+			p = q;
+		}
+		tp->snd_holes = 0;
+		tp->snd_fack = tp->snd_una;
+		tp->retran_data = 0;
+		tp->snd_awnd = 0;
 		if (tp->t_state != TCPS_TIME_WAIT &&
 		    tp->t_rcvtime <= tcp_maxidle) {
 			tp->t_timer[TCPT_2MSL] = tcp_keepintvl;
@@ -386,6 +399,16 @@
 	 * to a longer retransmit interval and retransmit one segment.
 	 */
 	case TCPT_REXMT:
+		q = p = tp->snd_holes;
+		while (p != 0) {
+			q = p->next;
+			FREE(p, M_PCB);
+			p = q;
+		}
+		tp->snd_holes = 0;
+		tp->snd_fack = tp->snd_una;
+		tp->retran_data = 0;
+		tp->snd_awnd = 0;
 		if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
 			tp->t_rxtshift = TCP_MAXRXTSHIFT;
 			tcpstat.tcps_timeoutdrop++;
@@ -451,7 +474,7 @@
-		 * Note:  We overload snd_recover to function also as the
-		 * snd_last variable described in RFC 2582
+		 * Note:  snd_last is a separate tcpcb field with this patch,
+		 * so snd_recover is no longer overloaded for that role here.
 		 */
-		tp->snd_recover = tp->snd_max;
+		tp->snd_last = tp->snd_max;
 		/*
 		 * Force a segment to be sent.
 		 */
diff -udr 10.2.3/xnu/bsd/netinet/tcp_usrreq.c xnu/bsd/netinet/tcp_usrreq.c
--- 10.2.3/xnu/bsd/netinet/tcp_usrreq.c	Thu Jan 30 23:54:31 2003
+++ xnu/bsd/netinet/tcp_usrreq.c	Tue Feb 18 11:22:23 2003
@@ -799,6 +799,11 @@
 	tp->iss = tcp_new_isn(tp);
 	tcp_sendseqinit(tp);
 
+	tp->snd_last = tp->snd_una;
+	tp->snd_fack = tp->snd_una;
+	tp->retran_data = 0;
+	tp->snd_awnd = 0;
+
 	/*
 	 * Generate a CC value for this connection and
 	 * check whether CC or CCnew should be used.
@@ -1009,6 +1014,9 @@
 			break;
 		case TCP_NOPUSH:
 			optval = tp->t_flags & TF_NOPUSH;
+			break;
+		case TCP_SACK_DISABLE:
+			optval = tp->sack_disable;
 			break;
 		default:
 			error = ENOPROTOOPT;
diff -udr 10.2.3/xnu/bsd/netinet/tcp_var.h xnu/bsd/netinet/tcp_var.h
--- 10.2.3/xnu/bsd/netinet/tcp_var.h	Thu Jan 30 23:54:31 2003
+++ xnu/bsd/netinet/tcp_var.h	Tue Feb 18 13:53:07 2003
@@ -63,6 +63,19 @@
 
 #define N_TIME_WAIT_SLOTS   128                /* must be power of 2 */
 
+struct sackblk {
+	tcp_seq start;		/* start seq no. of sack block */
+	tcp_seq end;		/* end seq no. */
+};
+
+struct sackhole {
+	tcp_seq start;		/* start seq no. of hole */
+	tcp_seq end;		/* end seq no. */
+	int     dups;		/* number of dup(s)acks for this hole */
+	tcp_seq rxmit;		/* next seq. no in hole to be retransmitted */
+	struct sackhole *next;	/* next in list */
+};
+
 /*
  * Kernel variables for tcp.
  */
@@ -147,6 +160,20 @@
 	u_long	rcv_wnd;		/* receive window */
 	tcp_seq	rcv_up;			/* receive urgent pointer */
 
+	int	sack_disable;		/* disable SACK for this connection */
+	int	snd_numholes;		/* number of holes seen by sender */
+	struct sackhole *snd_holes;	/* linked list of holes (sorted) */
+	tcp_seq	snd_fack;		/* for FACK congestion control */
+	u_long	snd_awnd;		/* snd_max - snd_fack + */
+					/* retransmitted data */
+	int	retran_data;		/* amount of outstanding retx. data */
+	tcp_seq	snd_last;		/* for use in fast recovery */
+	tcp_seq	rcv_laststart;		/* start of last segment recd. */
+	tcp_seq	rcv_lastend;		/* end of ... */
+	tcp_seq	rcv_lastsack;		/* last seq number(+1) sack'd by rcv'r*/
+	int	rcv_numsacks;		/* # distinct sack blks present */
+	struct sackblk sackblks[MAX_SACK_BLKS]; /* seq nos. of sack blocks */
+
 	u_long	snd_wnd;		/* send window */
 	u_long	snd_cwnd;		/* congestion-controlled window */
 	u_long	snd_ssthresh;		/* snd_cwnd size threshold for
@@ -374,7 +401,8 @@
 #define	TCPCTL_PCBLIST		11	/* list of all outstanding PCBs */
 #define	TCPCTL_DELACKTIME	12	/* time before sending delayed ACK */
 #define	TCPCTL_V6MSSDFLT	13	/* MSS default for IPv6 */
-#define	TCPCTL_MAXID		14
+#define TCPCTL_SACK		14
+#define	TCPCTL_MAXID		15
 
 #define TCPCTL_NAMES { \
 	{ 0, 0 }, \
@@ -391,6 +419,7 @@
 	{ "pcblist", CTLTYPE_STRUCT }, \
 	{ "delacktime", CTLTYPE_INT }, \
 	{ "v6mssdflt", CTLTYPE_INT }, \
+	{ "sack", CTLTYPE_INT }, \
 }
 
 #ifdef __APPLE_API_PRIVATE
@@ -403,6 +432,7 @@
 extern	struct inpcbinfo tcbinfo;
 extern	struct tcpstat tcpstat;	/* tcp statistics */
 extern	int tcp_mssdflt;	/* XXX */
+extern	int tcp_do_sack;
 extern	int tcp_delack_enabled;
 extern	int tcp_do_newreno;
 extern	int ss_fltsz;
@@ -447,6 +477,17 @@
 	 tcp_timers __P((struct tcpcb *, int));
 void	 tcp_trace __P((int, int, struct tcpcb *, void *, struct tcphdr *,
 			int));
+int	tcp_sack_option __P((struct tcpcb *,struct tcphdr *,u_char *,int));
+void	tcp_update_sack_list __P((struct tcpcb *tp));
+void	tcp_del_sackholes __P((struct tcpcb *, struct tcphdr *));
+void	tcp_clean_sackreport __P((struct tcpcb *tp));
+void	tcp_sack_adjust __P((struct tcpcb *tp));
+struct sackhole * 
+	tcp_sack_output __P((struct tcpcb *tp));
+int	tcp_sack_partialack __P((struct tcpcb *, struct tcphdr *));
+ 
+u_long	tcp_seq_subtract  __P((u_long, u_long ));
+
 
 extern	struct pr_usrreqs tcp_usrreqs;
 extern	u_long tcp_sendspace;
Only in xnu/: go
