--- Copyright (c) 2007, Centre for Advanced Internet Architectures
--- Swinburne University of Technology, Melbourne, Australia
--- (CRICOS number 00111D).
---
--- CAIA Modular Congestion Control Patch v0.9.1
---
--- This patch was created against the FreeBSD 7.0-BETA4 source tree
--- cvsup'd on 6th December 2007.
---
--- This software was developed by James Healy <jhealy@swin.edu.au>
--- and Lawrence Stewart <lastewart@swin.edu.au>
---
--- All rights reserved.
---
--- Redistribution and use in source and binary forms, with or without
--- modification, are permitted provided that the following conditions
--- are met:
--- 1. Redistributions of source code must retain the above copyright
---    notice, this list of conditions and the following disclaimer.
--- 2. Redistributions in binary form must reproduce the above copyright
---    notice, this list of conditions and the following disclaimer in the
---    documentation and/or other materials provided with the distribution.
--- 3. The names of the authors, the "Centre for Advanced Internet Architectures"
---    and "Swinburne University of Technology" may not be used to endorse
---    or promote products derived from this software without specific
---    prior written permission.
---
--- THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS \`\`AS IS'' AND
--- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
--- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
--- ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
--- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
--- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
--- OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
--- HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
--- LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
--- OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
--- SUCH DAMAGE.
---
--- sys/netinet/tcp_cc_functions.h.orig	1970-01-01 10:00:00.000000000 +1000
+++ sys/netinet/tcp_cc_functions.h	2007-11-22 11:59:45.000000000 +1100
@@ -0,0 +1,38 @@
+#ifndef _NETINET_TCP_CC_FUNCTIONS_H_
+#define _NETINET_TCP_CC_FUNCTIONS_H_
+
+#include <sys/queue.h>
+#include <netinet/tcp_var.h>
+
+/*
+ * Global CC state (tcprexmtthresh is defined in tcp_input.c, the rest in tcp_cc_functions.c)
+ */
+extern	STAILQ_HEAD(tcp_cc_head, tcp_cc_functions) tcp_cc_list;
+extern	char tcp_cc_algorithm[];
+extern	const int tcprexmtthresh;
+extern	struct tcp_cc_functions newreno_cc_functions;
+
+/*
+ * Declare the net.inet.tcp.cc sysctl node (created in tcp_cc_functions.c)
+ */
+SYSCTL_DECL(_net_inet_tcp_cc);
+
+/*
+ * CC housekeeping: one-time setup and (de)registration of algorithms
+ */
+void	tcp_cc_init(void);
+void	tcp_cc_register_algorithm(struct tcp_cc_functions *add_cc);
+void	tcp_cc_deregister_algorithm(struct tcp_cc_functions *remove_cc);
+
+/*
+ * NewReno CC functions, used as the hooks of newreno_cc_functions
+ */
+int	newreno_init(struct tcpcb *tp);
+void	newreno_cwnd_init(struct tcpcb *tp);
+void	newreno_ack_received(struct tcpcb *tp);
+void	newreno_post_fr(struct tcpcb *tp, struct tcphdr *th);
+void	newreno_after_idle(struct tcpcb *tp);
+void	newreno_after_timeout(struct tcpcb *tp);
+void	newreno_ssthresh_update(struct tcpcb *tp);
+
+#endif /* _NETINET_TCP_CC_FUNCTIONS_H_ */
--- sys/netinet/tcp_cc_functions.c.orig	1970-01-01 10:00:00.000000000 +1000
+++ sys/netinet/tcp_cc_functions.c	2007-11-27 11:33:56.000000000 +1100
@@ -0,0 +1,347 @@
+#include <sys/param.h>
+#include <sys/libkern.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_cc_functions.h>
+
+
+// list of registered cc algorithms; the first entry is the system default
+struct tcp_cc_head tcp_cc_list = STAILQ_HEAD_INITIALIZER(tcp_cc_list); 
+// NOTE(review): M_STRING is not used anywhere else in this file - confirm it is needed
+MALLOC_DECLARE(M_STRING);
+MALLOC_DEFINE(M_STRING, "string", "a string");
+
+// the built-in newreno algorithm: every hook except deinit is provided
+struct tcp_cc_functions newreno_cc_functions = {
+	.name = "newreno",
+	.init = newreno_init,
+	.deinit = NULL,
+	.tcp_cwnd_init = newreno_cwnd_init,
+	.tcp_ack_received = newreno_ack_received,
+	.tcp_pre_fr = newreno_ssthresh_update,
+	.tcp_post_fr = newreno_post_fr,
+	.tcp_after_idle = newreno_after_idle,
+	.tcp_after_timeout = newreno_after_timeout
+};
+
+// name of the system wide default cc algorithm (net.inet.tcp.cc.algorithm)
+char tcp_cc_algorithm[TCP_CC_MAX_ALGORITHM_NAME_LEN+1];
+
+// sysctl handler that allows the default cc algorithm for the system to be
+// viewed and changed. An unknown algorithm name is rejected with EINVAL
+static int
+tcp_cc_default_algorithm(SYSCTL_HANDLER_ARGS)
+{
+	struct tcp_cc_functions *funcs;
+	char newname[TCP_CC_MAX_ALGORITHM_NAME_LEN+1];
+	int error;
+
+	// req->newptr may be a userland address: let sysctl_handle_string copy it in
+	strlcpy(newname, tcp_cc_algorithm, sizeof(newname));
+	error = sysctl_handle_string(oidp, newname, sizeof(newname), req);
+	if (error != 0 || req->newptr == NULL)
+		return error;
+
+	STAILQ_FOREACH(funcs, &tcp_cc_list, entries)
+		if (strncmp(newname, funcs->name, TCP_CC_MAX_ALGORITHM_NAME_LEN) == 0)
+			break;
+	if (funcs == NULL)
+		return EINVAL;
+
+	// the system default is always the first element of tcp_cc_list
+	if (funcs != STAILQ_FIRST(&tcp_cc_list)) {
+		STAILQ_REMOVE(&tcp_cc_list, funcs, tcp_cc_functions, entries);
+		STAILQ_INSERT_HEAD(&tcp_cc_list, funcs, entries);
+	}
+	strlcpy(tcp_cc_algorithm, newname, sizeof(tcp_cc_algorithm));
+	return 0;
+}
+
+// sysctl handler that displays the available cc algorithms as a read
+// only value: a ", " separated list of names ending in a single NUL
+static int
+tcp_cc_list_available(SYSCTL_HANDLER_ARGS)
+{
+	struct tcp_cc_functions *funcs;
+	int error = 0, pos = 0;
+	// room for the ", " separator, a full-length name and the trailing NUL
+	char buf[TCP_CC_MAX_ALGORITHM_NAME_LEN + 3];
+
+	STAILQ_FOREACH(funcs, &tcp_cc_list, entries)
+	{
+		// bounded formatting; the separator is skipped for the first entry
+		snprintf(buf, sizeof(buf), "%s%s", pos == 0 ? "" : ", ", funcs->name);
+
+		error = sysctl_handle_opaque(oidp, buf, strlen(buf), req);
+		if (error)
+			return error;
+
+		pos++;
+	}
+
+	// terminate the string handed back to the caller
+	buf[0] = '\0';
+	error = sysctl_handle_opaque(oidp, buf, 1, req);
+	return error;
+}
+
+// initialise cc on system boot
+void
+tcp_cc_init(void)
+{
+	// initialise the list of cc algorithms
+	STAILQ_INIT(&tcp_cc_list);
+
+	// add newreno to the list of available algorithms
+	tcp_cc_register_algorithm(&newreno_cc_functions);
+
+	// set newreno to the system default; strlcpy always NUL-terminates
+	strlcpy(tcp_cc_algorithm, newreno_cc_functions.name, sizeof(tcp_cc_algorithm));
+}
+
+// unregister a cc algorithm and move every connection using it to newreno
+void
+tcp_cc_deregister_algorithm(struct tcp_cc_functions *remove_cc)
+{
+	struct tcp_cc_functions *funcs;
+	struct tcpcb *tp;
+	struct inpcb *inp;
+
+	// newreno is the fallback for every connection, so it stays registered
+	if (remove_cc == &newreno_cc_functions)
+		return;
+
+	// remove the algorithm from the list available to the system
+	STAILQ_FOREACH(funcs, &tcp_cc_list, entries)
+	{
+		if (funcs != remove_cc)
+			continue;
+		STAILQ_REMOVE(&tcp_cc_list, funcs, tcp_cc_functions, entries);
+		// if this algorithm was the system default, reset the default to
+		// newreno: new connections use the head of tcp_cc_list, so newreno
+		// is moved there as well as being named in tcp_cc_algorithm
+		if (strncmp(tcp_cc_algorithm, remove_cc->name, TCP_CC_MAX_ALGORITHM_NAME_LEN) == 0)
+		{
+			strlcpy(tcp_cc_algorithm, newreno_cc_functions.name, sizeof(tcp_cc_algorithm));
+			STAILQ_REMOVE(&tcp_cc_list, &newreno_cc_functions, tcp_cc_functions, entries);
+			STAILQ_INSERT_HEAD(&tcp_cc_list, &newreno_cc_functions, entries);
+		}
+		break;
+	}
+
+	// check all active control blocks and change any that are using this
+	// algorithm back to newreno. If the algorithm that was in use requires
+	// deinit code to be run, call it
+	// TODO: this walk is done without the tcbinfo/inpcb locks held
+	LIST_FOREACH(inp, &tcb, inp_list)
+	{
+		// a pcb in TIME_WAIT points at a struct tcptw rather than a tcpcb,
+		// and inp_ppcb may already have been cleared; neither has cc state
+		if (inp->inp_vflag & INP_TIMEWAIT)
+			continue;
+		tp = intotcpcb(inp);
+		if (tp == NULL || tp->cc_functions != remove_cc)
+			continue;
+
+		tp->cc_functions = &newreno_cc_functions;
+		if (remove_cc->deinit)
+			remove_cc->deinit(tp);
+	}
+}
+
+void
+tcp_cc_register_algorithm(struct tcp_cc_functions *add_cc)
+{
+	STAILQ_INSERT_TAIL(&tcp_cc_list, add_cc, entries);	// appended at the tail, so an existing head entry stays first
+}
+
+/*
+ * NEW RENO
+ * newreno_init: per-connection init hook; it only logs a message and returns 0
+ */
+int
+newreno_init(struct tcpcb *tp)
+{
+	printf("initialising tcp connection with newreno congestion control\n");
+	return 0;
+}
+
+// pre fast-recovery hook: set ssthresh to half of min(snd_wnd, snd_cwnd),
+// rounded down to a whole number of segments
+void
+newreno_ssthresh_update(struct tcpcb *tp)
+{
+	u_int mss = tp->t_maxseg;
+	u_int halfsegs = min(tp->snd_wnd, tp->snd_cwnd) / 2 / mss;
+
+	// ssthresh is never allowed to drop below two segments
+	if (halfsegs >= 2)
+		tp->snd_ssthresh = halfsegs * mss;
+	else
+		tp->snd_ssthresh = 2 * mss;
+}
+
+// initial cwnd at the start of a connection
+// if there is a hostcache entry for the foreign host, base cwnd on that
+// if rfc3390 is enabled, set cwnd to approx 4 MSS as recommended
+// otherwise use the sysctl variables configured by the administrator
+void
+newreno_cwnd_init(struct tcpcb *tp)
+{
+	struct hc_metrics_lite metrics;
+	struct inpcb *inp = tp->t_inpcb;
+	struct socket *so = inp->inp_socket;
+
+	/*
+	 * Set the slow-start flight size depending on whether this
+	 * is a local network or not. The address family is read from
+	 * inp_vflag, as this file has no isipv6 variable in scope.
+	 * Extend this so we cache the cwnd too and retrieve it here.
+	 * Make cwnd even bigger than RFC3390 suggests but only if we
+	 * have previous experience with the remote host. Be careful
+	 * not to make cwnd bigger than the remote receive window or our
+	 * own send socket buffer. Maybe put some additional upper bound
+	 * on the retrieved cwnd. Should do incremental updates to
+	 * hostcache when cwnd collapses so the next connection doesn't
+	 * overload the path again.
+	 *
+	 * RFC3390 says only do this if SYN or SYN/ACK didn't get lost.
+	 * We currently check only in syncache_socket for that.
+	 */
+
+	tcp_hc_get(&inp->inp_inc, &metrics);
+
+#define TCP_METRICS_CWND
+#ifdef TCP_METRICS_CWND
+	if (metrics.rmx_cwnd)
+		tp->snd_cwnd = max(tp->t_maxseg,
+				min(metrics.rmx_cwnd / 2,
+				 min(tp->snd_wnd, so->so_snd.sb_hiwat)));
+	else
+#endif
+	if (tcp_do_rfc3390)
+		tp->snd_cwnd = min(4 * tp->t_maxseg, max(2 * tp->t_maxseg, 4380));
+#ifdef INET6
+	else if (((inp->inp_vflag & INP_IPV6) && in6_localaddr(&inp->in6p_faddr)) ||
+		 (!(inp->inp_vflag & INP_IPV6) && in_localaddr(inp->inp_faddr)))
+#else
+	else if (in_localaddr(inp->inp_faddr))
+#endif
+		tp->snd_cwnd = tp->t_maxseg * ss_fltsz_local;
+	else
+		tp->snd_cwnd = tp->t_maxseg * ss_fltsz;
+}
+
+// grow cwnd when an ACK for new data arrives
+// slow start (cwnd <= ssthresh): add one MSS per ACK
+// congestion avoidance (cwnd > ssthresh): add MSS*MSS/cwnd per ACK (~1 MSS per RTT)
+void
+newreno_ack_received(struct tcpcb *tp)
+{
+	u_int mss = tp->t_maxseg;
+	u_int cwnd = tp->snd_cwnd;
+	u_int step = (cwnd > tp->snd_ssthresh) ? mss * mss / cwnd : mss;
+
+	// the result is capped at the largest window the send scale allows
+	tp->snd_cwnd = min(cwnd + step, TCP_MAXWIN<<tp->snd_scale);
+
+}
+
+// called on leaving fast recovery to deflate cwnd.
+// th can be null, in which case cwnd is simply set to ssthresh (reno
+// rather than new reno behaviour).
+void
+newreno_post_fr(struct tcpcb *tp, struct tcphdr *th)
+{
+	/*
+	 * Default: window inflation should have left approximately
+	 * snd_ssthresh of data outstanding, so resume from there.
+	 */
+	tp->snd_cwnd = tp->snd_ssthresh;
+
+	/*
+	 * If less than that is outstanding we would be inclined to
+	 * send a burst: better to do it via the slow start mechanism,
+	 * so only allow what is in flight plus one segment.
+	 */
+	if (th && SEQ_GT(th->th_ack + tp->snd_ssthresh, tp->snd_max))
+		tp->snd_cwnd = tp->snd_max - th->th_ack + tp->t_maxseg;
+}
+
+// if a connection has been idle for a while and more data is ready to be sent,
+// reset cwnd
+void
+newreno_after_idle(struct tcpcb *tp)
+{
+	/*
+	 * We have been idle for "a while" and no acks are
+	 * expected to clock out any data we send --
+	 * slow start to get ack "clock" running again.
+	 *
+	 * Set the slow-start flight size depending on whether
+	 * this is a local network or not.
+	 *
+	 * The address family is read from the inpcb's inp_vflag,
+	 * as there is no isipv6 variable in scope in this file.
+	 */
+	struct inpcb *inp = tp->t_inpcb;
+	int ss = ss_fltsz;
+
+#ifdef INET6
+	if (inp->inp_vflag & INP_IPV6) {
+		if (in6_localaddr(&inp->in6p_faddr))
+			ss = ss_fltsz_local;
+	} else
+#endif /* INET6 */
+	if (in_localaddr(inp->inp_faddr))
+		ss = ss_fltsz_local;
+
+	tp->snd_cwnd = tp->t_maxseg * ss;
+}
+
+// retransmit timeout hook: recompute ssthresh, then collapse cwnd to one segment
+void 
+newreno_after_timeout(struct tcpcb *tp)
+{
+	newreno_ssthresh_update(tp);
+
+	/*
+	 * Close the congestion window down to one segment
+	 * (we'll open it by one segment for each ack we get).
+	 * Since we probably have a window's worth of unacked
+	 * data accumulated, this "slow start" keeps us from
+	 * dumping all that data as back-to-back packets (which
+	 * might overwhelm an intermediate gateway).
+	 *
+	 * There are two phases to the opening: Initially we
+	 * open by one mss on each ack.  This makes the window
+	 * size increase exponentially with time.  If the
+	 * window is larger than the path can handle, this
+	 * exponential growth results in dropped packet(s)
+	 * almost immediately.  To get more time between
+	 * drops but still "push" the network to take advantage
+	 * of improving conditions, we switch from exponential
+	 * to linear window opening at some threshold size.
+	 * For a threshold, we use half the current window
+	 * size, truncated to a multiple of the mss; it was
+	 * computed above, before cwnd is overwritten here.
+	 *
+	 * (the minimum cwnd that will give us exponential
+	 * growth is 2 mss.  The threshold never goes below this.)
+	 */
+	tp->snd_cwnd = tp->t_maxseg;
+}
+
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, cc, CTLFLAG_RW, NULL, "TCP congestion control related settings");
+// net.inet.tcp.cc.algorithm: read/write name of the system default algorithm
+SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, algorithm, CTLTYPE_STRING|CTLFLAG_RW, &tcp_cc_algorithm, sizeof(tcp_cc_algorithm), tcp_cc_default_algorithm, "A", "default tcp congestion algorithm");
+// net.inet.tcp.cc.available: read-only list of the registered algorithms
+SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, available, CTLTYPE_STRING|CTLFLAG_RD, NULL, 0, tcp_cc_list_available, "A", "list available tcp congestion algorithms");
--- sys/conf/files.orig	2007-12-07 16:51:38.000000000 +1100
+++ sys/conf/files	2007-12-07 16:18:28.000000000 +1100
@@ -1880,6 +1880,7 @@
 netinet/sctp_usrreq.c		optional inet inet6 sctp
 netinet/sctputil.c		optional inet inet6 sctp
 netinet/tcp_debug.c		optional tcpdebug
+netinet/tcp_cc_functions.c	optional inet
 netinet/tcp_hostcache.c		optional inet
 netinet/tcp_input.c		optional inet
 netinet/tcp_output.c		optional inet
--- sys/netinet/tcp_input.c.orig	2007-12-07 16:43:36.000000000 +1100
+++ sys/netinet/tcp_input.c	2007-11-22 11:59:45.000000000 +1100
@@ -96,7 +96,7 @@
 
 #include <security/mac/mac_framework.h>
 
-static const int tcprexmtthresh = 3;
+const int tcprexmtthresh = 3;
 
 struct	tcpstat tcpstat;
 SYSCTL_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RW,
@@ -123,7 +123,7 @@
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW,
     &tcp_do_rfc3042, 0, "Enable RFC 3042 (Limited Transmit)");
 
-static int tcp_do_rfc3390 = 1;
+int tcp_do_rfc3390 = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW,
     &tcp_do_rfc3390, 0,
     "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)");
@@ -1000,14 +1000,9 @@
 			if (SEQ_GT(th->th_ack, tp->snd_una) &&
 			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
 			    tp->snd_cwnd >= tp->snd_wnd &&
-			    ((!tcp_do_newreno &&
-			      !(tp->t_flags & TF_SACK_PERMIT) &&
-			      tp->t_dupacks < tcprexmtthresh) ||
-			     ((tcp_do_newreno ||
-			       (tp->t_flags & TF_SACK_PERMIT)) &&
-			      !IN_FASTRECOVERY(tp) &&
-			      (to.to_flags & TOF_SACK) == 0 &&
-			      TAILQ_EMPTY(&tp->snd_holes)))) {
+			    !IN_FASTRECOVERY(tp) &&
+			    (to.to_flags & TOF_SACK) == 0 &&
+			    TAILQ_EMPTY(&tp->snd_holes)) {
 				KASSERT(headlocked,
 				    ("%s: headlocked", __func__));
 				INP_INFO_WUNLOCK(&tcbinfo);
@@ -1759,13 +1754,14 @@
 				 * to keep a constant cwnd packets in the
 				 * network.
 				 */
+
 				if (!tcp_timer_active(tp, TT_REXMT) ||
 				    th->th_ack != tp->snd_una)
 					tp->t_dupacks = 0;
+
 				else if (++tp->t_dupacks > tcprexmtthresh ||
-				    ((tcp_do_newreno ||
-				      (tp->t_flags & TF_SACK_PERMIT)) &&
-				     IN_FASTRECOVERY(tp))) {
+            IN_FASTRECOVERY(tp)) {
+
 					if ((tp->t_flags & TF_SACK_PERMIT) &&
 					    IN_FASTRECOVERY(tp)) {
 						int awnd;
@@ -1789,7 +1785,6 @@
 					goto drop;
 				} else if (tp->t_dupacks == tcprexmtthresh) {
 					tcp_seq onxt = tp->snd_nxt;
-					u_int win;
 
 					/*
 					 * If we're doing sack, check to
@@ -1803,22 +1798,26 @@
 							tp->t_dupacks = 0;
 							break;
 						}
-					} else if (tcp_do_newreno) {
+					} else {
 						if (SEQ_LEQ(th->th_ack,
 						    tp->snd_recover)) {
 							tp->t_dupacks = 0;
 							break;
 						}
 					}
-					win = min(tp->snd_wnd, tp->snd_cwnd) /
-					    2 / tp->t_maxseg;
-					if (win < 2)
-						win = 2;
-					tp->snd_ssthresh = win * tp->t_maxseg;
+
+					// If the current tcp cc module has defined a hook
+					// for tasks to run before entering FR, call it
+					if (tp->cc_functions->tcp_pre_fr)
+						tp->cc_functions->tcp_pre_fr(tp);
+
 					ENTER_FASTRECOVERY(tp);
 					tp->snd_recover = tp->snd_max;
 					tcp_timer_activate(tp, TT_REXMT, 0);
 					tp->t_rtttime = 0;
+
+					// if SACK is enabled, set some variables in the control block,
+					// send the lost packet and then finish processing this packet
 					if (tp->t_flags & TF_SACK_PERMIT) {
 						tcpstat.tcps_sack_recovery_episode++;
 						tp->sack_newdata = tp->snd_nxt;
@@ -1826,18 +1825,23 @@
 						(void) tcp_output(tp);
 						goto drop;
 					}
+
 					tp->snd_nxt = th->th_ack;
 					tp->snd_cwnd = tp->t_maxseg;
 					(void) tcp_output(tp);
 					KASSERT(tp->snd_limited <= 2,
-					    ("%s: tp->snd_limited too big",
-					    __func__));
+					("%s: tp->snd_limited too big",
+					__func__));
+					// set cwnd to an appropriate value as we enter fast recovery
 					tp->snd_cwnd = tp->snd_ssthresh +
-					     tp->t_maxseg *
-					     (tp->t_dupacks - tp->snd_limited);
+					tp->t_maxseg *
+					(tp->t_dupacks - tp->snd_limited);
+
 					if (SEQ_GT(onxt, tp->snd_nxt))
 						tp->snd_nxt = onxt;
+
 					goto drop;
+
 				} else if (tcp_do_rfc3042) {
 					u_long oldcwnd = tp->snd_cwnd;
 					tcp_seq oldsndmax = tp->snd_max;
@@ -1880,38 +1884,17 @@
 		 * If the congestion window was inflated to account
 		 * for the other side's cached packets, retract it.
 		 */
-		if (tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) {
-			if (IN_FASTRECOVERY(tp)) {
-				if (SEQ_LT(th->th_ack, tp->snd_recover)) {
-					if (tp->t_flags & TF_SACK_PERMIT)
-						tcp_sack_partialack(tp, th);
-					else
-						tcp_newreno_partial_ack(tp, th);
-				} else {
-					/*
-					 * Out of fast recovery.
-					 * Window inflation should have left us
-					 * with approximately snd_ssthresh
-					 * outstanding data.
-					 * But in case we would be inclined to
-					 * send a burst, better to do it via
-					 * the slow start mechanism.
-					 */
-					if (SEQ_GT(th->th_ack +
-							tp->snd_ssthresh,
-						   tp->snd_max))
-						tp->snd_cwnd = tp->snd_max -
-								th->th_ack +
-								tp->t_maxseg;
-					else
-						tp->snd_cwnd = tp->snd_ssthresh;
-				}
-			}
-		} else {
-			if (tp->t_dupacks >= tcprexmtthresh &&
-			    tp->snd_cwnd > tp->snd_ssthresh)
-				tp->snd_cwnd = tp->snd_ssthresh;
-		}
+    if (IN_FASTRECOVERY(tp)) {
+      if (SEQ_LT(th->th_ack, tp->snd_recover)) {
+        if (tp->t_flags & TF_SACK_PERMIT)
+          tcp_sack_partialack(tp, th);
+        else
+          tcp_newreno_partial_ack(tp, th);
+      } else {
+        if (tp->cc_functions->tcp_post_fr)
+          tp->cc_functions->tcp_post_fr(tp, th);
+      }
+    }
 		tp->t_dupacks = 0;
 		/*
 		 * If we reach this point, ACK is not a duplicate,
@@ -2014,13 +1997,9 @@
 		 * Otherwise open linearly: maxseg per window
 		 * (maxseg^2 / cwnd per packet).
 		 */
-		if ((!tcp_do_newreno && !(tp->t_flags & TF_SACK_PERMIT)) ||
-		    !IN_FASTRECOVERY(tp)) {
-			u_int cw = tp->snd_cwnd;
-			u_int incr = tp->t_maxseg;
-			if (cw > tp->snd_ssthresh)
-				incr = incr * incr / cw;
-			tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<<tp->snd_scale);
+		if (!IN_FASTRECOVERY(tp)) {
+			if (tp->cc_functions->tcp_ack_received)
+        			tp->cc_functions->tcp_ack_received(tp);
 		}
 		SOCKBUF_LOCK(&so->so_snd);
 		if (acked > so->so_snd.sb_cc) {
@@ -2035,14 +2014,11 @@
 		/* NB: sowwakeup_locked() does an implicit unlock. */
 		sowwakeup_locked(so);
 		/* Detect una wraparound. */
-		if ((tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) &&
-		    !IN_FASTRECOVERY(tp) &&
+		if (!IN_FASTRECOVERY(tp) &&
 		    SEQ_GT(tp->snd_una, tp->snd_recover) &&
 		    SEQ_LEQ(th->th_ack, tp->snd_recover))
 			tp->snd_recover = th->th_ack - 1;
-		if ((tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) &&
-		    IN_FASTRECOVERY(tp) &&
-		    SEQ_GEQ(th->th_ack, tp->snd_recover))
+		if (IN_FASTRECOVERY(tp) && SEQ_GEQ(th->th_ack, tp->snd_recover))
 			EXIT_FASTRECOVERY(tp);
 		tp->snd_una = th->th_ack;
 		if (tp->t_flags & TF_SACK_PERMIT) {
@@ -2909,41 +2885,11 @@
 	if (metrics.rmx_bandwidth)
 		tp->snd_bandwidth = metrics.rmx_bandwidth;
 
-	/*
-	 * Set the slow-start flight size depending on whether this
-	 * is a local network or not.
-	 *
-	 * Extend this so we cache the cwnd too and retrieve it here.
-	 * Make cwnd even bigger than RFC3390 suggests but only if we
-	 * have previous experience with the remote host. Be careful
-	 * not make cwnd bigger than remote receive window or our own
-	 * send socket buffer. Maybe put some additional upper bound
-	 * on the retrieved cwnd. Should do incremental updates to
-	 * hostcache when cwnd collapses so next connection doesn't
-	 * overloads the path again.
-	 *
-	 * RFC3390 says only do this if SYN or SYN/ACK didn't got lost.
-	 * We currently check only in syncache_socket for that.
-	 */
-#define TCP_METRICS_CWND
-#ifdef TCP_METRICS_CWND
-	if (metrics.rmx_cwnd)
-		tp->snd_cwnd = max(mss,
-				min(metrics.rmx_cwnd / 2,
-				 min(tp->snd_wnd, so->so_snd.sb_hiwat)));
-	else
-#endif
-	if (tcp_do_rfc3390)
-		tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380));
-#ifdef INET6
-	else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) ||
-		 (!isipv6 && in_localaddr(inp->inp_faddr)))
-#else
-	else if (in_localaddr(inp->inp_faddr))
-#endif
-		tp->snd_cwnd = mss * ss_fltsz_local;
+	// set the initial cwnd value
+	if (tp->cc_functions->tcp_cwnd_init)
+		tp->cc_functions->tcp_cwnd_init(tp);
 	else
-		tp->snd_cwnd = mss * ss_fltsz;
+		tp->snd_cwnd = mss;
 
 	/* Check the interface for TSO capabilities. */
 	if (mtuflags & CSUM_TSO)
--- sys/netinet/tcp_output.c.orig	2007-12-07 16:43:23.000000000 +1100
+++ sys/netinet/tcp_output.c	2007-12-07 13:11:01.000000000 +1100
@@ -98,10 +98,6 @@
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, local_slowstart_flightsize, CTLFLAG_RW,
 	&ss_fltsz_local, 1, "Slow start flight size for local networks");
 
-int     tcp_do_newreno = 1;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW,
-	&tcp_do_newreno, 0, "Enable NewReno Algorithms");
-
 int	tcp_do_tso = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW,
 	&tcp_do_tso, 0, "Enable TCP Segmentation Offload");
@@ -162,24 +158,9 @@
 	 */
 	idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
 	if (idle && (ticks - tp->t_rcvtime) >= tp->t_rxtcur) {
-		/*
-		 * We have been idle for "a while" and no acks are
-		 * expected to clock out any data we send --
-		 * slow start to get ack "clock" running again.
-		 *
-		 * Set the slow-start flight size depending on whether
-		 * this is a local network or not.
-		 */
-		int ss = ss_fltsz;
-#ifdef INET6
-		if (isipv6) {
-			if (in6_localaddr(&tp->t_inpcb->in6p_faddr))
-				ss = ss_fltsz_local;
-		} else
-#endif /* INET6 */
-		if (in_localaddr(tp->t_inpcb->inp_faddr))
-			ss = ss_fltsz_local;
-		tp->snd_cwnd = tp->t_maxseg * ss;
+		// reset cwnd after a period of idleness
+		if (tp->cc_functions->tcp_after_idle)
+			tp->cc_functions->tcp_after_idle(tp);
 	}
 	tp->t_flags &= ~TF_LASTIDLE;
 	if (idle) {
--- sys/netinet/tcp_subr.c.orig	2007-12-07 16:43:06.000000000 +1100
+++ sys/netinet/tcp_subr.c	2007-12-07 13:11:01.000000000 +1100
@@ -84,6 +84,7 @@
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
+#include <netinet/tcp_cc_functions.h>
 #include <netinet/tcp_syncache.h>
 #ifdef INET6
 #include <netinet6/tcp6_var.h>
@@ -267,6 +268,8 @@
 	tcp_inflight_rttthresh = TCPTV_INFLIGHT_RTTTHRESH;
 	tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT;
 
+	tcp_cc_init();
+
 	INP_INFO_LOCK_INIT(&tcbinfo, "tcp");
 	LIST_INIT(&tcb);
 	tcbinfo.ipi_listhead = &tcb;
@@ -590,7 +593,20 @@
 	tm = uma_zalloc(tcpcb_zone, M_NOWAIT | M_ZERO);
 	if (tm == NULL)
 		return (NULL);
+
 	tp = &tm->tcb;
+
+	// use the current system default cc algorithm, which is always the first
+	// algorithm in tcp_cc_list
+	tp->cc_functions = STAILQ_FIRST(&tcp_cc_list);
+
+	// if the cc module fails to initialize, stop building the control block
+	if (tp->cc_functions->init(tp) > 0)
+	{
+		uma_zfree(tcpcb_zone, tp);
+		return NULL;
+	}
+
 	tp->t_timers = &tm->tt;
 	/*	LIST_INIT(&tp->t_segq); */	/* XXX covered by M_ZERO */
 	tp->t_maxseg = tp->t_maxopd =
@@ -749,6 +765,13 @@
 		tp->t_segqlen--;
 		tcp_reass_qsize--;
 	}
+
+	/* allow the congestion control algorithm in use for this control
+	* block to clean up after itself 
+	*/
+	if (tp->cc_functions->deinit)
+		tp->cc_functions->deinit(tp);
+
 	tcp_free_sackholes(tp);
 	inp->inp_ppcb = NULL;
 	tp->t_inpcb = NULL;
--- sys/netinet/tcp_syncache.c.orig	2007-12-07 16:42:13.000000000 +1100
+++ sys/netinet/tcp_syncache.c	2007-12-07 13:11:01.000000000 +1100
@@ -1184,7 +1184,7 @@
 	if (to->to_flags & TOF_SIGNATURE)
 		sc->sc_flags |= SCF_SIGNATURE;
 #endif
-	if (to->to_flags & TOF_SACK)
+	if (to->to_flags & TOF_SACKPERM)
 		sc->sc_flags |= SCF_SACK;
 	if (to->to_flags & TOF_MSS)
 		sc->sc_peer_mss = to->to_mss;	/* peer mss may be zero */
--- sys/netinet/tcp_timer.c.orig	2007-12-07 16:42:30.000000000 +1100
+++ sys/netinet/tcp_timer.c	2007-11-21 15:37:45.000000000 +1100
@@ -518,38 +518,12 @@
 	 * If timing a segment in this window, stop the timer.
 	 */
 	tp->t_rtttime = 0;
-	/*
-	 * Close the congestion window down to one segment
-	 * (we'll open it by one segment for each ack we get).
-	 * Since we probably have a window's worth of unacked
-	 * data accumulated, this "slow start" keeps us from
-	 * dumping all that data as back-to-back packets (which
-	 * might overwhelm an intermediate gateway).
-	 *
-	 * There are two phases to the opening: Initially we
-	 * open by one mss on each ack.  This makes the window
-	 * size increase exponentially with time.  If the
-	 * window is larger than the path can handle, this
-	 * exponential growth results in dropped packet(s)
-	 * almost immediately.  To get more time between
-	 * drops but still "push" the network to take advantage
-	 * of improving conditions, we switch from exponential
-	 * to linear window opening at some threshhold size.
-	 * For a threshhold, we use half the current window
-	 * size, truncated to a multiple of the mss.
-	 *
-	 * (the minimum cwnd that will give us exponential
-	 * growth is 2 mss.  We don't allow the threshhold
-	 * to go below this.)
-	 */
-	{
-		u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;
-		if (win < 2)
-			win = 2;
-		tp->snd_cwnd = tp->t_maxseg;
-		tp->snd_ssthresh = win * tp->t_maxseg;
-		tp->t_dupacks = 0;
-	}
+
+	if (tp->cc_functions->tcp_after_timeout)
+		tp->cc_functions->tcp_after_timeout(tp);
+
+	tp->t_dupacks = 0;
+ 
 	EXIT_FASTRECOVERY(tp);
 	(void) tcp_output(tp);
 
--- sys/netinet/tcp_var.h.orig	2007-12-07 16:42:45.000000000 +1100
+++ sys/netinet/tcp_var.h	2007-11-22 11:59:45.000000000 +1100
@@ -206,6 +206,9 @@
 	int	t_rttlow;		/* smallest observerved RTT */
 	u_int32_t	rfbuf_ts;	/* recv buffer autoscaling timestamp */
 	int	rfbuf_cnt;		/* recv buffer autoscaling byte count */
+
+	struct tcp_cc_functions *cc_functions;	/* the functions that will manage congestion control*/
+	void	*cc_data;		/* pointer to a struct containing data required for the cc algorithm in use */
 };
 
 #define IN_FASTRECOVERY(tp)	(tp->t_flags & TF_FASTRECOVERY)
@@ -446,6 +449,43 @@
 };
 #endif
 
+#define TCP_CC_MAX_ALGORITHM_NAME_LEN 15
+
+/*
+ * Structure to hold function pointers to the functions responsible
+ * for congestion control. Based on similar structure in the SCTP stack
+ */
+struct tcp_cc_functions {
+	char name[TCP_CC_MAX_ALGORITHM_NAME_LEN];	/* algorithm name, compared against the requested default */
+
+	// mandatory: called for every new control block; a return value > 0 aborts its creation
+	int (*init) (struct tcpcb *tp);
+
+	// optional (may be NULL): clean up when the control block stops using this algorithm
+	void (*deinit) (struct tcpcb *tp);
+
+	// optional: initialise cwnd at the start of a connection; when NULL cwnd is set to one mss
+	void (*tcp_cwnd_init) (struct tcpcb *tp);
+
+	// optional: called on the receipt of a valid ack while not in fast recovery
+	void (*tcp_ack_received) (struct tcpcb *tp);
+
+	// optional: hook to perform any necessary tasks before entering FR
+	void (*tcp_pre_fr) (struct tcpcb *tp);
+
+	// optional: called for the ack that takes the connection out of fast recovery
+	void (*tcp_post_fr) (struct tcpcb *tp, struct tcphdr *th);
+
+	// optional: perform tasks when data transfer resumes after an idle period
+	void (*tcp_after_idle) (struct tcpcb *tp);
+
+	// optional: perform tasks when the connection's retransmit timer expires
+	void (*tcp_after_timeout) (struct tcpcb *tp);
+
+	// linkage for the list of registered algorithms (tcp_cc_list)
+	STAILQ_ENTRY(tcp_cc_functions) entries;
+};
+
 /*
  * Names for TCP sysctl objects
  */
@@ -498,7 +538,7 @@
 extern	int tcp_mssdflt;	/* XXX */
 extern	int tcp_minmss;
 extern	int tcp_delack_enabled;
-extern	int tcp_do_newreno;
+extern	int tcp_do_rfc3390;
 extern	int path_mtu_discovery;
 extern	int ss_fltsz;
 extern	int ss_fltsz_local;