--- Copyright (c) 2007-2008,2010, Centre for Advanced Internet Architectures
--- Swinburne University of Technology, Melbourne, Australia
--- (CRICOS number 00111D).
--- Copyright (c) 2008-2010, Lawrence Stewart <lastewart@swin.edu.au>
--- All rights reserved.
---
--- Redistribution and use in source and binary forms, with or without
--- modification, are permitted provided that the following conditions
--- are met:
--- 1. Redistributions of source code must retain the above copyright
---    notice, this list of conditions and the following disclaimer.
--- 2. Redistributions in binary form must reproduce the above copyright
---    notice, this list of conditions and the following disclaimer in the
---    documentation and/or other materials provided with the distribution.
--- 3. The names of the authors, "Swinburne University of Technology" and the
---    "Centre for Advanced Internet Architectures" may not be used to endorse
---    or promote products derived from this software without specific
---    prior written permission.
---
--- THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
--- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
--- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
--- ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
--- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
--- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
--- OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
--- HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
--- LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
--- OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
--- SUCH DAMAGE.
---
--- CAIA Modular Congestion Control v0.10.0 and Khelp Framework v0.1.1 Bundle
---
--- This patch was created against revision 209905 of FreeBSD 9-CURRENT.
---
--- To obtain the correct revision of the FreeBSD source tree that this patch
--- applies to, and store it in the local directory "/path/to/src", run:
---
--- svn co -r 209905 http://svn.freebsd.org/base/head </path/to/src>
---
--- Make sure the base system you are installing onto is already running
--- FreeBSD 9.x before continuing.
---
--- Issuing the following commands will result in a running modular congestion
--- control and Khelp framework capable system:
---
--- cd /path/to/src
--- patch -p1 < /path/to/caia_modularcc_v0.10.0_khelp_v0.1.1_bundle_9.x.r209905.patch
--- cd /path/to/src/
--- make buildworld buildkernel installkernel installworld
--- mergemaster -iF -m /path/to/src
--- reboot
---
--- The modular congestion control patch was first released in 2007 by
--- James Healy and Lawrence Stewart whilst working on the NewTCP research
--- project at Swinburne University's Centre for Advanced Internet
--- Architectures, Melbourne, Australia, which was made possible in part by a
--- grant from the Cisco University Research Program Fund at
--- Community Foundation Silicon Valley. More details are available at:
---     http://caia.swin.edu.au/urp/newtcp/
---
--- Lawrence Stewart has continued development of this work since 2008 in his
--- spare time. More recently in 2010 David Hayes has contributed to the work,
--- especially in the context of delay-based TCP congestion control.
---
--- The Khelp framework patch was first released in 2010 by Lawrence Stewart
--- whilst studying at Swinburne University's Centre for Advanced Internet
--- Architectures, Melbourne, Australia. The work is released as part
--- of the NewTCP research project. More details are available at:
---     http://caia.swin.edu.au/urp/newtcp/
---
--- Lawrence Stewart is currently the sole maintainer of both patches.
--- All contact regarding this bundle patch should be directed to him
--- via email: lastewart@swin.edu.au
---
diff -r 7159011c25ae -r 2d2f6f743238 sys/conf/files
--- a/sys/conf/files	Sun Jul 11 20:33:39 2010 +0000
+++ b/sys/conf/files	Fri Jul 23 15:02:54 2010 +1000
@@ -2097,9 +2097,11 @@
 kern/kern_fail.c		standard
 kern/kern_fork.c		standard
 kern/kern_gzio.c		optional gzio
+kern/kern_hhook.c		standard
 kern/kern_idle.c		standard
 kern/kern_intr.c		standard
 kern/kern_jail.c		standard
+kern/kern_khelp.c		standard
 kern/kern_kthread.c		standard
 kern/kern_ktr.c			optional ktr
 kern/kern_ktrace.c		standard
@@ -2560,6 +2562,8 @@
 netinet/ip_options.c		optional inet
 netinet/ip_output.c		optional inet
 netinet/raw_ip.c		optional inet
+netinet/cc/cc.c			optional inet
+netinet/cc/cc_newreno.c		optional inet
 netinet/sctp_asconf.c		optional inet sctp
 netinet/sctp_auth.c		optional inet sctp
 netinet/sctp_bsd_addr.c		optional inet sctp
diff -r 7159011c25ae -r 2d2f6f743238 sys/kern/kern_hhook.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sys/kern/kern_hhook.c	Fri Jul 23 15:02:54 2010 +1000
@@ -0,0 +1,301 @@
+/*-
+ * Copyright (c) 2010 Lawrence Stewart <lstewart@freebsd.org>
+ * All rights reserved.
+ *
+ * This software was developed at the Centre for Advanced Internet
+ * Architectures, Swinburne University, by Lawrence Stewart,
+ * made possible in part by a grant from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/hhook.h>
+#include <sys/khelp.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/queue.h>
+#include <sys/rmlock.h>
+#include <sys/systm.h>
+
+#include <vm/uma.h>
+
+#include <net/vnet.h>
+
+
+#define	RLOCK_HHOOK_HEAD	0x01
+#define	WLOCK_HHOOK_HEAD	0x02
+
+MALLOC_DECLARE(M_HHOOK);
+MALLOC_DEFINE(M_HHOOK, "helper hook related memory", "Blah");
+
+struct hhook {	/* One hook function attached to a hook head. */
+	hhook_func_t h_func;	/* Called by run_hhooks(). */
+	void	*h_udata;	/* Passed as h_func's first argument. */
+	struct helper *h_helper;	/* Helper supplied at registration. */
+        STAILQ_ENTRY(hhook) h_next;	/* Link in hhook_head.hh_hooks. */
+};
+
+typedef	STAILQ_HEAD(hhook_list, hhook) hhook_list_t;
+
+struct hhook_head {	/* A hook point, identified by its (type, id) pair. */
+	int	hh_type;	/* Matched with hh_id by get_hhook_head(). */
+	int	hh_id;
+	int	hh_nhooks;	/* Number of entries on hh_hooks. */
+	hhook_list_t	hh_hooks;	/* Hooks, run in insertion order. */
+	struct rmlock	hh_lock;	/* Presumably used by HHOOK_HEAD_*LOCK. */
+	LIST_ENTRY(hhook_head) hh_next;	/* Link in V_hhook_head_list. */
+};
+
+LIST_HEAD(hhookheadhead, hhook_head);
+VNET_DEFINE(struct hhookheadhead, hhook_head_list);
+#define	V_hhook_head_list	VNET(hhook_head_list)
+
+static struct mtx hhook_head_list_lock;
+MTX_SYSINIT(hhookheadlistlock, &hhook_head_list_lock, "hhook_head list lock",
+    MTX_DEF);
+
+static struct	hhook_head *	get_hhook_head(int hhook_type, int hhook_id,
+    struct rm_priotracker* rmpt, int flags);
+
+
+/*
+ * Public KPI functions
+ */
+/* Register a new hook head; returns 0, EEXIST (duplicate) or ENOMEM. */
+int
+register_hhook_head(int hhook_type, int hhook_id, int flags)
+{
+	struct hhook_head *hh;
+
+	/* Allocate before taking the list lock: M_WAITOK may sleep. */
+	hh = malloc(sizeof(struct hhook_head), M_HHOOK,
+	    M_ZERO | ((flags & HHOOK_WAITOK) ? M_WAITOK : M_NOWAIT));
+	if (hh == NULL)
+		return (ENOMEM);
+
+	HHOOK_HEAD_LIST_LOCK();
+	if (get_hhook_head(hhook_type, hhook_id, NULL, 0) != NULL) {
+		/* Every exit path must drop the list lock. */
+		HHOOK_HEAD_LIST_UNLOCK();
+		free(hh, M_HHOOK);
+		return (EEXIST);
+	}
+	printf("About to register hhook_head %p with type: %d and id: %d\n", hh,
+	hhook_type, hhook_id);
+	hh->hh_type = hhook_type;
+	hh->hh_id = hhook_id;
+	hh->hh_nhooks = 0;
+	STAILQ_INIT(&hh->hh_hooks);
+	HHOOK_HEAD_LOCK_INIT(hh);
+	LIST_INSERT_HEAD(&V_hhook_head_list, hh, hh_next);
+	HHOOK_HEAD_LIST_UNLOCK();
+	return (0);
+}
+/* Destroy the (type, id) hook head and its hooks; ENOENT if not found. */
+int
+deregister_hhook_head(int hhook_type, int hhook_id)
+{
+	struct hhook_head *hh;
+	struct hhook *tmp, *tmp2;
+	int error = 0;
+
+	HHOOK_HEAD_LIST_LOCK();
+	hh = get_hhook_head(hhook_type, hhook_id, NULL, WLOCK_HHOOK_HEAD);
+
+	if (hh == NULL)
+		error = ENOENT;
+	else {
+		LIST_REMOVE(hh, hh_next);
+		/* Free every hook still attached to this head. */
+		STAILQ_FOREACH_SAFE(tmp, &hh->hh_hooks, h_next, tmp2) {
+			free(tmp, M_HHOOK);
+		}
+		/* The lookup returned hh write-locked; unlock before destroy. */
+		HHOOK_HEAD_WUNLOCK(hh);
+		HHOOK_HEAD_LOCK_DESTROY(hh);
+		free(hh, M_HHOOK);
+	}
+
+	HHOOK_HEAD_LIST_UNLOCK();
+	return (error);
+}
+/* Attach hook/udata to the (type, id) head: 0, ENOMEM, ENOENT or EEXIST. */
+int
+register_hhook(int hhook_type, int hhook_id, struct helper *helper,
+    hhook_func_t hook, void *udata, int flags)
+{
+	struct hhook *h, *tmp;
+	struct hhook_head *hh;
+	int error = 0;
+
+	h = malloc(sizeof(struct hhook), M_HHOOK,
+	    M_ZERO | ((flags & HHOOK_WAITOK) ? M_WAITOK : M_NOWAIT));
+
+	if (h == NULL)
+		return (ENOMEM);
+
+	h->h_helper = helper;
+	h->h_func = hook;
+	h->h_udata = udata;
+	/* NOTE(review): no HHOOK_HEAD_LIST_LOCK around this lookup - confirm. */
+	hh = get_hhook_head(hhook_type, hhook_id, NULL, WLOCK_HHOOK_HEAD);
+
+	if (hh == NULL) {
+		free(h, M_HHOOK);
+		return (ENOENT);
+	}
+	/* A (hook, udata) pair may appear on a head only once. */
+	STAILQ_FOREACH(tmp, &hh->hh_hooks, h_next) {
+		if (tmp->h_func == hook && tmp->h_udata == udata) {
+			error = EEXIST;
+			break;
+		}
+	}
+
+	if (!error) {
+		STAILQ_INSERT_TAIL(&hh->hh_hooks, h, h_next);
+		hh->hh_nhooks++;
+	}
+	else
+		free(h, M_HHOOK);
+
+	HHOOK_HEAD_WUNLOCK(hh);
+
+	return (error);
+}
+/* Detach hook/udata from the (type, id) head; ENOENT if no such head. */
+int
+deregister_hhook(int hhook_type, int hhook_id, hhook_func_t hook, void *udata,
+    int flags)
+{
+	struct hhook *tmp;
+	struct hhook_head *hh;
+
+	hh = get_hhook_head(hhook_type, hhook_id, NULL, WLOCK_HHOOK_HEAD);
+
+	if (hh == NULL)
+		return (ENOENT);
+	/* NOTE: returns 0 even if no hook matches; flags is unused here. */
+	STAILQ_FOREACH(tmp, &hh->hh_hooks, h_next) {
+		if (tmp->h_func == hook && tmp->h_udata == udata) {
+			STAILQ_REMOVE(&hh->hh_hooks, tmp, hhook, h_next);
+			free(tmp, M_HHOOK);
+			hh->hh_nhooks--;
+			break;
+		}
+	}
+
+	HHOOK_HEAD_WUNLOCK(hh);
+	return (0);
+}
+/* Call each hook registered on the (type, id) head, if that head exists. */
+void
+run_hhooks(int hhook_type, int hhook_id, void *ctx_data,
+    struct helper_dblocks *hdbs)
+{
+	struct hhook_head *hh;
+	struct hhook *tmp;
+	struct rm_priotracker rmpt;
+	int i = 0;
+	void *dblock = NULL;
+	uint32_t nblocks = hdbs->nblocks;
+
+	hh = get_hhook_head(hhook_type, hhook_id, &rmpt, RLOCK_HHOOK_HEAD);
+
+	if (hh == NULL)
+		return;
+
+	STAILQ_FOREACH(tmp, &hh->hh_hooks, h_next) {
+		/* A helper needing a data block only runs when blocks[i] */
+		/* carries its id; i then advances to the next block. */
+		if (tmp->h_helper->h_flags & HELPER_NEEDS_DBLOCK) {
+			if (nblocks == 0
+			    || i >= nblocks
+			    || tmp->h_helper->h_id != hdbs->blocks[i].hd_id)
+				continue;
+			dblock = hdbs->blocks[i].hd_block;
+			i++;
+		}
+		tmp->h_func(tmp->h_udata, ctx_data, dblock, hdbs);
+		dblock = NULL;
+	}
+
+	HHOOK_HEAD_RUNLOCK(hh, &rmpt);
+}
+
+
+/*
+ * Private KPI functions
+ */
+static struct hhook_head *
+get_hhook_head(int hhook_type, int hhook_id, struct rm_priotracker *rmpt,
+    int flags)
+{
+	struct hhook_head *tmp, *ret = NULL;
+
+	/* Returns the matching head, locked as flags request, or NULL. */
+	/* XXX: hhook_head_list_lock ownership is not asserted here yet. */
+
+	LIST_FOREACH(tmp, &V_hhook_head_list, hh_next) {
+		if (tmp->hh_type == hhook_type && tmp->hh_id == hhook_id) {
+			ret = tmp;
+			if (flags & RLOCK_HHOOK_HEAD)
+				HHOOK_HEAD_RLOCK(ret, rmpt);
+			else if (flags & WLOCK_HHOOK_HEAD)
+				HHOOK_HEAD_WLOCK(ret);
+			break;
+		}
+	}
+
+	return (ret);
+}
+
+static int
+vnet_hhook_init(const void *unused)
+{
+
+	LIST_INIT(&V_hhook_head_list);
+	return (0);
+}
+
+static int
+vnet_hhook_uninit(const void *unused)
+{
+
+	return (0);
+}
+
+#define	HHOOK_SYSINIT_ORDER	SI_SUB_PROTO_BEGIN
+#define	HHOOK_MODEVENT_ORDER	(SI_ORDER_FIRST) 
+#define	HHOOK_VNET_ORDER	(HHOOK_MODEVENT_ORDER + 2) 
+
+VNET_SYSINIT(vnet_hhook_init, HHOOK_SYSINIT_ORDER, HHOOK_VNET_ORDER,
+    vnet_hhook_init, NULL);
+ 
+VNET_SYSUNINIT(vnet_hhook_uninit, HHOOK_SYSINIT_ORDER, HHOOK_VNET_ORDER,
+    vnet_hhook_uninit, NULL);
+
diff -r 7159011c25ae -r 2d2f6f743238 sys/kern/kern_khelp.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sys/kern/kern_khelp.c	Fri Jul 23 15:02:54 2010 +1000
@@ -0,0 +1,276 @@
+/*-
+ * Copyright (c) 2010 Lawrence Stewart <lstewart@freebsd.org>
+ * All rights reserved.
+ *
+ * This software was developed at the Centre for Advanced Internet
+ * Architectures, Swinburne University, by Lawrence Stewart,
+ * made possible in part by a grant from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/khelp.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/module_khelp.h>
+#include <sys/queue.h>
+#include <sys/refcount.h>
+#include <sys/rwlock.h>
+#include <sys/systm.h>
+
+#include <vm/uma.h>
+
+static struct rwlock helper_list_lock;
+RW_SYSINIT(helperlistlock, &helper_list_lock, "helper list lock");
+
+static STAILQ_HEAD(helper_head, helper) helpers = STAILQ_HEAD_INITIALIZER(helpers);
+
+static int num_dblocks = 0;
+
+/* Monotonically increasing ID assigned to helpers on registration. */
+static int32_t helper_id = 0;
+
+static struct helper * get_helper(int32_t id);
+
+/*
+ * Public KPI functions.
+ */
+/*
+ * Allocate one data block per registered HELPER_NEEDS_DBLOCK helper into hdbs.
+ * Returns 0, or ENOMEM after releasing everything allocated so far.
+ */
+int
+init_helper_dblocks(struct helper_dblocks *hdbs)
+{
+	struct helper *h;
+	int i = 0, j = 0, error = 0;
+
+	KASSERT(hdbs != NULL, ("struct helper_dblocks not initialised!"));
+	HELPER_LIST_RLOCK();
+	if (num_dblocks == 0) {
+		HELPER_LIST_RUNLOCK();
+		return (0);
+	}
+	/* XXXLAS: Should only allocate for helpers of the appropriate class. */
+	hdbs->blocks = malloc(num_dblocks * sizeof(struct helper_dblock), M_HELPER,
+	    M_NOWAIT | M_ZERO);
+	if (hdbs->blocks == NULL) {
+		HELPER_LIST_RUNLOCK();
+		return (ENOMEM);
+	}
+	STAILQ_FOREACH(h, &helpers, h_next) {
+		if (!(h->h_flags & HELPER_NEEDS_DBLOCK))
+			continue;
+		hdbs->blocks[i].hd_block = uma_zalloc(h->h_zone, M_NOWAIT);
+		if (hdbs->blocks[i].hd_block == NULL) {
+			error = ENOMEM;
+			break;
+		}
+		hdbs->blocks[i].hd_id = h->h_id;
+		refcount_acquire(&h->h_refcount);
+		i++;
+	}
+	if (error) {
+		/* Undo: the j-th dblock helper in list order owns blocks[j]. */
+		STAILQ_FOREACH(h, &helpers, h_next)
+			if (j < i && (h->h_flags & HELPER_NEEDS_DBLOCK)) {
+				uma_zfree(h->h_zone,
+				    hdbs->blocks[j++].hd_block);
+				refcount_release(&h->h_refcount);
+			}
+		free(hdbs->blocks, M_HELPER);
+		hdbs->blocks = NULL;
+	}
+	hdbs->nblocks = (error ? 0 : i);
+	HELPER_LIST_RUNLOCK();
+	return (error);
+}
+/* Release every data block in hdbs and the block array; always returns 0. */
+int
+destroy_helper_dblocks(struct helper_dblocks *hdbs)
+{
+	struct helper *h;
+	int32_t nblocks = hdbs->nblocks;
+
+	HELPER_LIST_WLOCK();
+
+	for (nblocks--; nblocks >= 0; nblocks--) {
+		if ((h = get_helper(hdbs->blocks[nblocks].hd_id)) != NULL) {
+			refcount_release(&h->h_refcount);
+			/* The reference was taken in init_helper_dblocks(); */
+			/* the block goes back to that helper's zone. */
+			uma_zfree(h->h_zone, hdbs->blocks[nblocks].hd_block);
+		}
+	}
+
+	HELPER_LIST_WUNLOCK();
+	free(hdbs->blocks, M_HELPER);
+	return (0);
+}
+
+/* Add h to the helper list and assign its id; always returns 0. */
+int
+register_helper(struct helper *h)
+{
+	HELPER_LIST_WLOCK();
+	/* num_dblocks counts only helpers that need a data block. */
+	if (h->h_flags & HELPER_NEEDS_DBLOCK)
+		num_dblocks++;
+
+	refcount_init(&h->h_refcount, 0);
+	h->h_id = helper_id++;
+	STAILQ_INSERT_TAIL(&helpers, h, h_next);
+	HELPER_LIST_WUNLOCK();
+	printf("Registered \"%s\" helper (mem %p)\n", h->h_name, h);
+	return (0);
+}
+
+/* Remove h from the helper list; EBUSY while its refcount is non-zero. */
+int
+deregister_helper(struct helper *h)
+{
+	int error = 0;
+
+	/* XXX: This helper's hooks should be removed here (under HHOOK_WLOCK). */
+
+	HELPER_LIST_WLOCK();
+	if (h->h_refcount > 0)
+		error = EBUSY;
+
+	if (!error) {
+		STAILQ_REMOVE(&helpers, h, helper, h_next);
+		/* Mirror register_helper(): only dblock helpers were counted. */
+		if (h->h_flags & HELPER_NEEDS_DBLOCK)
+			num_dblocks--;
+		printf("Deregistered \"%s\" helper (mem %p)\n", h->h_name, h);
+	}
+	HELPER_LIST_WUNLOCK();
+	return (error);
+}
+/* Return the id of the registered helper named hname, or -1 if none. */
+int32_t
+get_helper_id(char *hname)
+{
+	struct helper *h;
+	int32_t id = -1;
+
+	HELPER_LIST_RLOCK();
+	STAILQ_FOREACH(h, &helpers, h_next) {
+		if (strncmp(h->h_name, hname, HELPER_NAME_MAXLEN) == 0) {
+			id = h->h_id;
+			break;
+		}
+	}
+	HELPER_LIST_RUNLOCK();
+	return (id);
+}
+
+void *
+get_helper_dblock(struct helper_dblocks *hdbs, int32_t id)
+{
+	int32_t i;	/* Signed: an unsigned index never fails ">= 0". */
+
+	/* Scan from the last block down; NULL means no block has this id. */
+	for (i = (int32_t)hdbs->nblocks - 1; i >= 0; i--)
+		if (hdbs->blocks[i].hd_id == id)
+			return (hdbs->blocks[i].hd_block);
+	return (NULL);
+}
+
+/*
+ * Private KPI functions.
+ */
+static struct helper *
+get_helper(int32_t id)
+{
+	struct helper *h;
+
+	HELPER_LIST_LOCK_ASSERT();
+	/* Linear search by id; NULL when no registered helper matches. */
+	STAILQ_FOREACH(h, &helpers, h_next) {
+		if (h->h_id == id)
+			return (h);
+	}
+	return (NULL);
+}
+
+/* Handles kld related events. Returns 0 on success, non-zero on failure. */
+int
+helper_modevent(module_t mod, int event_type, void *data)
+{
+	int error = 0;
+	struct helper_modevent_data *hmd = (struct helper_modevent_data *)data;
+
+	switch(event_type) {
+	case MOD_LOAD:
+		if (hmd->helper->h_flags & HELPER_NEEDS_DBLOCK) {
+			if (hmd->uma_zsize <= 0) {
+				printf("Use DECLARE_HELPER_UMA() instead!\n");
+				error = EDOOFUS;
+				break;
+			}
+			hmd->helper->h_zone = uma_zcreate(hmd->name,
+			    hmd->uma_zsize, hmd->umactor, hmd->umadtor,
+			    NULL, NULL, 0, 0);
+			if (hmd->helper->h_zone == NULL) {
+				error = ENOMEM;
+				break;
+			}
+		}
+		strlcpy(hmd->helper->h_name, hmd->name, HELPER_NAME_MAXLEN);
+		if (hmd->helper->mod_init != NULL)
+			error = hmd->helper->mod_init();
+		if (!error)
+			error = register_helper(hmd->helper);
+		/* Don't leak the zone created above if the load fails. */
+		if (error && (hmd->helper->h_flags & HELPER_NEEDS_DBLOCK))
+			uma_zdestroy(hmd->helper->h_zone);
+		break;
+
+	case MOD_QUIESCE:
+		error = deregister_helper(hmd->helper);
+		if (error) {
+			printf("Helper's refcount != 0, can't unload\n");
+			break;
+		}
+		/* A zone only exists for helpers that need a data block. */
+		if (hmd->helper->h_flags & HELPER_NEEDS_DBLOCK)
+			uma_zdestroy(hmd->helper->h_zone);
+		if (hmd->helper->mod_destroy != NULL)
+			hmd->helper->mod_destroy();
+		break;
+
+	case MOD_SHUTDOWN:
+	case MOD_UNLOAD:
+		break;
+	default:
+		error = EINVAL;
+		break;
+	}
+	return (error);
+}
diff -r 7159011c25ae -r 2d2f6f743238 sys/netinet/cc.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sys/netinet/cc.h	Fri Jul 23 15:02:54 2010 +1000
@@ -0,0 +1,142 @@
+/*-
+ * Copyright (c) 2008-2009
+ * 	Swinburne University of Technology, Melbourne, Australia
+ * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
+ * All rights reserved.
+ *
+ * This software was developed at the Centre for Advanced Internet
+ * Architectures, Swinburne University, by Lawrence Stewart and James Healy,
+ * made possible in part by a grant from the Cisco University Research Program
+ * Fund at Community Foundation Silicon Valley.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_CC_H_
+#define _NETINET_CC_H_
+
+/* XXX: TCP_CA_NAME_MAX define lives in tcp.h for compat reasons. */
+#include <netinet/tcp.h>
+
+/* Global CC vars. */
+extern	STAILQ_HEAD(cc_head, cc_algo) cc_list;
+extern	const int tcprexmtthresh;
+extern	struct cc_algo newreno_cc_algo;
+
+/* Define the new net.inet.tcp.cc sysctl tree. */
+SYSCTL_DECL(_net_inet_tcp_cc);
+
+/* CC housekeeping functions. */
+void	cc_init(void);
+int	cc_register_algo(struct cc_algo *add_cc);
+int	cc_deregister_algo(struct cc_algo *remove_cc);
+
+/*
+ * Wrapper around transport structs that contain same-named congestion control
+ * variables. Allows algos to be shared amongst multiple CC aware transports.
+ */
+struct cc_var {
+	void *cc_data;		/* Per-connection private CC algorithm data. */
+	int abc_sentawnd;	/* Has ABC counted a cwnd's worth of bytes? */
+	int bytes_this_ack;	/* # bytes acked by the current ACK. */
+	int cwnd_limited;	/* Are we currently cwnd limited? */
+	int type;		/* Indicates which ptr is valid in ccvc. */
+	union ccv_container {
+		struct tcpcb *tcp;
+		struct sctp_nets *sctp;
+	} ccvc;
+};
+
+/* ACK types passed to ack_received. */
+#define CC_ACK		0x0001 /* Regular in sequence ACK. */
+#define CC_DUPACK	0x0002 /* Duplicate ACK. */
+#define CC_PARTIALACK	0x0004 /* Not yet. */
+#define CC_SACK		0x0008 /* Not yet. */
+
+/*
+ * Congestion signal types passed to cong_signal.
+ * The highest order 8 bits (0x01000000 - 0x80000000) are reserved
+ * for CC algos to declare their own congestion signal types.
+ */
+#define CC_ECN		0x000001 /* ECN marked packet received. */
+#define CC_RTO		0x000002 /* RTO fired. */
+#define CC_RTO_ERR	0x000004 /* RTO fired in error. */
+#define CC_NDUPACK	0x000008 /* Threshold of dupack's reached. */
+
+/*
+ * Structure to hold data and function pointers that together represent
+ * a congestion control algorithm.
+ */
+struct cc_algo {
+	char name[TCP_CA_NAME_MAX];
+
+	/* Init global module state on kldload. */
+	int (*mod_init) (void);
+
+	/* Cleanup global module state on kldunload. */
+	int (*mod_destroy) (void);
+
+	/* Init CC state for a new control block. */
+	int (*cb_init) (struct cc_var *ccv);
+
+	/* Cleanup CC state for a terminating control block. */
+	void (*cb_destroy) (struct cc_var *ccv);
+
+	/* Init variables for a newly established connection. */
+	void (*conn_init) (struct cc_var *ccv);
+
+	/* Called on receipt of an ack. */
+	void (*ack_received) (struct cc_var *ccv, uint16_t type);
+
+	/* Called on detection of a congestion signal. */
+	void (*cong_signal) (struct cc_var *ccv, uint32_t type);
+
+	/* Called after exiting congestion recovery. */
+	void (*post_recovery) (struct cc_var *ccv);
+
+	/* Called when data transfer resumes after an idle period. */
+	void (*after_idle) (struct cc_var *ccv);
+
+	STAILQ_ENTRY(cc_algo) entries;
+};
+
+/* Macro to obtain the CC algo's struct ptr. */
+#define CC_ALGO(tp)	((tp)->cc_algo)
+
+/* Macro to obtain the CC algo's data ptr. */
+#define CC_DATA(tp)	((tp)->cc_data)
+
+/* Macro to obtain the system default CC algo's struct ptr. */
+#define CC_DEFAULT()	STAILQ_FIRST(&cc_list)
+
+extern struct rwlock cc_list_lock;
+#define CC_LIST_LOCK_INIT() rw_init(&cc_list_lock, "cc_list")
+#define CC_LIST_LOCK_DESTROY() rw_destroy(&cc_list_lock)
+#define CC_LIST_RLOCK() rw_rlock(&cc_list_lock)
+#define CC_LIST_RUNLOCK() rw_runlock(&cc_list_lock)
+#define CC_LIST_WLOCK() rw_wlock(&cc_list_lock)
+#define CC_LIST_WUNLOCK() rw_wunlock(&cc_list_lock)
+#define CC_LIST_WLOCK_ASSERT() rw_assert(&cc_list_lock, RA_WLOCKED)
+
+#endif /* _NETINET_CC_H_ */
diff -r 7159011c25ae -r 2d2f6f743238 sys/netinet/cc/cc.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sys/netinet/cc/cc.c	Fri Jul 23 15:02:54 2010 +1000
@@ -0,0 +1,327 @@
+/*-
+ * Copyright (c) 2007-2009
+ *	Swinburne University of Technology, Melbourne, Australia
+ * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
+ * All rights reserved.
+ *
+ * This software was developed at the Centre for Advanced Internet
+ * Architectures, Swinburne University, by Lawrence Stewart and James Healy,
+ * made possible in part by a grant from the Cisco University Research Program
+ * Fund at Community Foundation Silicon Valley.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/libkern.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/queue.h>
+#include <sys/rwlock.h>
+#include <sys/sbuf.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+
+#include <netinet/cc.h>
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/tcp_var.h>
+
+#include <netinet/cc/cc_module.h>
+
+/*
+ * List of available cc algorithms on the current system. First element
+ * is used as the system default CC algorithm.
+ */
+struct cc_head cc_list = STAILQ_HEAD_INITIALIZER(cc_list);
+
+/* Protects the cc_list STAILQ. */
+struct rwlock cc_list_lock;
+
+/*
+ * Set the default CC algorithm to new_default. The default is identified
+ * by being the first element in the cc_list STAILQ.
+ */
+static void
+cc_set_default(struct cc_algo *new_default)
+{
+	CC_LIST_WLOCK_ASSERT();
+
+	/*
+	 * Make the requested system default CC algorithm the first element in
+	 * the list if it isn't already. It must already be on cc_list.
+	 */
+	if (new_default != CC_DEFAULT()) {
+		STAILQ_REMOVE(&cc_list, new_default, cc_algo, entries);
+		STAILQ_INSERT_HEAD(&cc_list, new_default, entries);
+	}
+}
+
+/*
+ * Sysctl handler to show and change the default CC algorithm.
+ */
+static int
+cc_default_algo(SYSCTL_HANDLER_ARGS)
+{
+	char default_cc[TCP_CA_NAME_MAX];
+	struct cc_algo *funcs;
+	int err;
+
+	/* Start from the current default so plain reads report it. */
+	CC_LIST_RLOCK();
+	strlcpy(default_cc, CC_DEFAULT()->name, sizeof(default_cc));
+	CC_LIST_RUNLOCK();
+
+	/*
+	 * req->newptr may be a userspace address, so never dereference it
+	 * directly; sysctl_handle_string() copies a new value into our buffer.
+	 */
+	err = sysctl_handle_string(oidp, default_cc, sizeof(default_cc), req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+
+	/* Find algo with specified name and set it to default. */
+	err = ESRCH;
+	CC_LIST_WLOCK();
+	STAILQ_FOREACH(funcs, &cc_list, entries) {
+		if (strncmp(default_cc, funcs->name, TCP_CA_NAME_MAX) == 0) {
+			cc_set_default(funcs);
+			err = 0;
+			break;
+		}
+	}
+	CC_LIST_WUNLOCK();
+
+	return (err);
+}
+
+/*
+ * Sysctl handler to display the comma separated list of available CC algos.
+ */
+static int
+cc_list_available(SYSCTL_HANDLER_ARGS)
+{
+	struct cc_algo *algo;
+	struct sbuf *s;
+	int err, first;
+
+	err = 0;
+	first = 1;
+	s = sbuf_new(NULL, NULL, TCP_CA_NAME_MAX, SBUF_AUTOEXTEND);
+
+	if (s == NULL)
+		return (ENOMEM);
+	/* NOTE(review): the sbuf may auto-extend under the rlock - confirm OK. */
+	CC_LIST_RLOCK();
+	STAILQ_FOREACH(algo, &cc_list, entries) {
+		err = sbuf_printf(s, first ? "%s" : ", %s", algo->name);
+		if (err)
+			break;
+		first = 0;
+	}
+	CC_LIST_RUNLOCK();
+
+	if (!err) {
+		sbuf_finish(s);
+		err = sysctl_handle_string(oidp, sbuf_data(s), 1, req);
+	}
+
+	sbuf_delete(s);
+	return (err);
+}
+
+/*
+ * Initialise the CC subsystem on boot: set up cc_list and its lock.
+ */
+void
+cc_init(void)
+{
+	CC_LIST_LOCK_INIT();
+	STAILQ_INIT(&cc_list);
+}
+
+/*
+ * Returns 0 on success, EPERM if the algo is newreno or is not registered.
+ */
+int
+cc_deregister_algo(struct cc_algo *remove_cc)
+{
+	struct cc_algo *funcs, *tmpfuncs;
+	struct tcpcb *tp;
+	struct inpcb *inp;
+	int err;
+
+	err = EPERM;
+
+	/* Never allow newreno to be deregistered. */
+	if (&newreno_cc_algo == remove_cc)
+		return (err);
+
+	/* Remove algo from cc_list so that new connections can't use it. */
+	CC_LIST_WLOCK();
+	STAILQ_FOREACH_SAFE(funcs, &cc_list, entries, tmpfuncs) {
+		if (funcs == remove_cc) {
+			/*
+			 * If we're removing the current system default,
+			 * reset the default to newreno.
+			 */
+			if (strncmp(CC_DEFAULT()->name,
+			    remove_cc->name,
+			    TCP_CA_NAME_MAX) == 0)
+				cc_set_default(&newreno_cc_algo);
+
+			STAILQ_REMOVE(&cc_list, funcs, cc_algo, entries);
+			err = 0;
+			break;
+		}
+	}
+	CC_LIST_WUNLOCK();
+
+	if (!err) {
+		/*
+		 * Check all active control blocks and change any that are
+		 * using this algorithm back to newreno. If the algorithm that
+		 * was in use requires cleanup code to be run, call it.
+		 *
+		 * New connections already part way through being initialised
+		 * with the CC algo we're removing will not race with this code
+		 * because the INP_INFO_WLOCK is held during initialisation.
+		 * We therefore don't enter the loop below until the connection
+		 * list has stabilised.
+		 */
+		INP_INFO_RLOCK(&V_tcbinfo);
+		LIST_FOREACH(inp, &V_tcb, inp_list) {
+			INP_WLOCK(inp);
+			/* Important to skip tcptw structs. */
+			if (!(inp->inp_flags & INP_TIMEWAIT) &&
+			    (tp = intotcpcb(inp)) != NULL) {
+				/*
+				 * By holding INP_WLOCK here, we are
+				 * assured that the connection is not
+				 * currently executing inside the CC
+				 * module's functions i.e. it is safe to
+				 * make the switch back to newreno.
+				 */
+				if (CC_ALGO(tp) == remove_cc) {
+					tmpfuncs = CC_ALGO(tp);
+					/* Newreno does not require any init. */
+					CC_ALGO(tp) = &newreno_cc_algo;
+					if (tmpfuncs->cb_destroy != NULL)
+						tmpfuncs->cb_destroy(tp->ccv);
+				}
+			}
+			INP_WUNLOCK(inp);
+		}
+		INP_INFO_RUNLOCK(&V_tcbinfo);
+	}
+
+	return (err);
+}
+
+/*
+ * Returns 0 on success, EEXIST if the algo or its name is already registered.
+ */
+int
+cc_register_algo(struct cc_algo *add_cc)
+{
+	struct cc_algo *funcs;
+	int err;
+
+	err = 0;
+
+	/*
+	 * Iterate over list of registered CC algorithms and make sure
+	 * we're not trying to add a duplicate.
+	 */
+	CC_LIST_WLOCK();
+	STAILQ_FOREACH(funcs, &cc_list, entries) {
+		if (funcs == add_cc || strncmp(funcs->name, add_cc->name,
+		    TCP_CA_NAME_MAX) == 0)
+			err = EEXIST;
+	}
+
+	if (!err)
+		STAILQ_INSERT_TAIL(&cc_list, add_cc, entries);
+
+	CC_LIST_WUNLOCK();
+
+	return (err);
+}
+
+/*
+ * Handles kld related events. Returns 0 on success, non-zero on failure.
+ */
+int
+cc_modevent(module_t mod, int event_type, void *data)
+{
+	struct cc_algo *algo;
+	int err = 0;
+
+	algo = (struct cc_algo *)data;
+
+	switch(event_type) {
+	case MOD_LOAD:
+		if (algo->mod_init != NULL)
+			err = algo->mod_init();
+		if (!err)
+			err = cc_register_algo(algo);
+		break;
+
+	case MOD_QUIESCE:
+		err = cc_deregister_algo(algo);
+		if (!err && algo->mod_destroy != NULL)
+			algo->mod_destroy();
+		break;
+
+	case MOD_SHUTDOWN:
+	case MOD_UNLOAD:
+		/* XXX: Fail here if MOD_QUIESCE failed. */
+		break;
+
+	default:
+		err = EINVAL;
+		break;
+	}
+
+	return (err);
+}
+
+/* Declare sysctl tree and populate it. */
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, cc, CTLFLAG_RW, NULL,
+    "congestion control related settings");
+
+SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, algorithm, CTLTYPE_STRING|CTLFLAG_RW,
+    NULL, 0, cc_default_algo, "A", "default congestion control algorithm");
+
+SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, available, CTLTYPE_STRING|CTLFLAG_RD,
+    NULL, 0, cc_list_available, "A",
+    "list available congestion control algorithms");
diff -r 7159011c25ae -r 2d2f6f743238 sys/netinet/cc/cc_module.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sys/netinet/cc/cc_module.h	Fri Jul 23 15:02:54 2010 +1000
@@ -0,0 +1,61 @@
+/*-
+ * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
+ * All rights reserved.
+ *
+ * This software was developed at the Centre for Advanced Internet
+ * Architectures, Swinburne University, by Lawrence Stewart, made possible
+ * in part by a grant from the Cisco University Research Program Fund at
+ * Community Foundation Silicon Valley.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_CC_MODULE_H_
+#define _NETINET_CC_MODULE_H_
+
+/*
+ * Allows a CC algorithm to manipulate a commonly named CC variable regardless
+ * of the transport protocol and associated C struct.
+ * XXXLAS: The protocol-dispatching form below is out of action until the work
+ * to support SCTP is done; until then CCV only reaches the TCP variables.
+ *
+#define	CCV(ccv, what)							\
+(*((ccv)->type == IPPROTO_TCP ?	&(ccv)->ccvc.tcp->what :		\
+				&(ccv)->ccvc.sctp->what))
+ */
+#define	CCV(ccv, what) (ccv)->ccvc.tcp->what
+
+/* Declares module ccname with cc_modevent as handler and ccalgo as its data. */
+#define	DECLARE_CC_MODULE(ccname, ccalgo) 				\
+	static moduledata_t cc_##ccname = {				\
+		.name = #ccname,					\
+		.evhand = cc_modevent,					\
+		.priv = ccalgo						\
+	};								\
+	DECLARE_MODULE(ccname, cc_##ccname,				\
+	    SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY)
+
+int	cc_modevent(module_t mod, int type, void *data);
+
+#endif /* _NETINET_CC_MODULE_H_ */
diff -r 7159011c25ae -r 2d2f6f743238 sys/netinet/cc/cc_newreno.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sys/netinet/cc/cc_newreno.c	Fri Jul 23 15:02:54 2010 +1000
@@ -0,0 +1,212 @@
+/*-
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
+ *	The Regents of the University of California.
+ * Copyright (c) 2007-2010
+ *	Swinburne University of Technology, Melbourne, Australia
+ * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
+ * All rights reserved.
+ *
+ * This software was developed at the Centre for Advanced Internet
+ * Architectures, Swinburne University, by Lawrence Stewart, James Healy and
+ * David Hayes, made possible in part by a grant from the Cisco University
+ * Research Program Fund at Community Foundation Silicon Valley.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+
+#include <netinet/cc.h>
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_var.h>
+
+#include <netinet/cc/cc_module.h>
+
+void	newreno_ack_received(struct cc_var *ccv, uint16_t type);
+void	newreno_cong_signal(struct cc_var *ccv, uint32_t type);
+void	newreno_post_recovery(struct cc_var *ccv);
+void	newreno_after_idle(struct cc_var *ccv);
+
+struct cc_algo newreno_cc_algo = {	/* members not named here stay NULL */
+	.name = "newreno",
+	.ack_received = newreno_ack_received,
+	.cong_signal = newreno_cong_signal,
+	.post_recovery = newreno_post_recovery,
+	.after_idle = newreno_after_idle
+};
+
+/*
+ * Open the congestion window in response to an ACK.
+ * Acts only when type is CC_ACK, the connection is not in recovery and
+ * ccv->cwnd_limited is set; otherwise cwnd is left untouched.
+ * slow start (cwnd <= ssthresh): grow cwnd exponentially
+ * cong avoid (cwnd > ssthresh): grow cwnd by ~1 MSS per RTT
+ */
+void
+newreno_ack_received(struct cc_var *ccv, uint16_t type)
+{
+	if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) &&
+	    ccv->cwnd_limited) {
+		u_int cw = CCV(ccv, snd_cwnd);
+		u_int incr = CCV(ccv, t_maxseg);
+
+		/*
+		 * Regular in-order ACK, open the congestion window.
+		 * Method depends on which congestion control state we're
+		 * in (slow start or cong avoid) and if ABC (RFC 3465) is
+		 * enabled.
+		 *
+		 * slow start and ABC (RFC 3465), not following an RTO:
+		 *   Grow cwnd exponentially by the amount of data
+		 *   ACKed capping the max increment per ACK to
+		 *   (abc_l_var * maxseg) bytes.
+		 *
+		 * slow start without ABC (RFC 5681), or following an RTO:
+		 *   Grow cwnd exponentially by maxseg per ACK.
+		 *
+		 * cong avoid and ABC (RFC 3465):
+		 *   Grow cwnd linearly by maxseg per RTT for each
+		 *   cwnd worth of ACKed data.
+		 *
+		 * cong avoid without ABC (RFC 5681):
+		 *   Grow cwnd linearly by approximately maxseg per RTT using
+		 *   maxseg^2 / cwnd per ACK as the increment.
+		 *   If cwnd > maxseg^2, fix the cwnd increment at 1 byte to
+		 *   avoid capping cwnd.
+		 */
+		if (cw > CCV(ccv, snd_ssthresh)) {
+			if (V_tcp_do_rfc3465) {
+				if (ccv->abc_sentawnd)
+					ccv->abc_sentawnd = 0;
+				else
+					incr = 0;
+			} else
+				incr = max((incr * incr / cw), 1);
+		} else if (V_tcp_do_rfc3465 &&
+		    CCV(ccv, snd_nxt) == CCV(ccv, snd_max)) {
+			/*
+			 * In slow-start with ABC enabled and no RTO in sight.
+			 * (Must not use abc_l_var > 1 if slow starting after
+			 * an RTO. On RTO, snd_nxt = snd_una, so the snd_nxt
+			 * == snd_max check is sufficient to handle this; incr
+			 * then stays at maxseg, as without ABC.)
+			 *
+			 * XXX: Relies on tcpcb sequence variables via CCV().
+			 */
+			incr = min(ccv->bytes_this_ack, V_tcp_abc_l_var *
+			    CCV(ccv, t_maxseg));
+		}
+
+		/* ABC is on by default, so incr equals 0 frequently. */
+		if (incr > 0)
+			CCV(ccv, snd_cwnd) = min(cw + incr,
+			    TCP_MAXWIN << CCV(ccv, snd_scale));
+	}
+}
+
+/*
+ * Manage congestion signals: on CC_NDUPACK or CC_ECN, cut ssthresh (and for
+ * ECN also cwnd) to half of cwnd in whole segments, with a two segment floor.
+ */
+void
+newreno_cong_signal(struct cc_var *ccv, uint32_t type)
+{
+	u_int mss, halved;
+
+	/* Half the window rounded down to segments, never fewer than two. */
+	mss = CCV(ccv, t_maxseg);
+	halved = max(CCV(ccv, snd_cwnd) / 2 / mss, 2) * mss;
+
+	if (type == CC_NDUPACK) {
+		if (IN_FASTRECOVERY(CCV(ccv, t_flags)))
+			return;
+		/* Leave ssthresh alone if congestion recovery is underway. */
+		if (!IN_CONGRECOVERY(CCV(ccv, t_flags)))
+			CCV(ccv, snd_ssthresh) = halved;
+		ENTER_RECOVERY(CCV(ccv, t_flags));
+	} else if (type == CC_ECN) {
+		if (IN_CONGRECOVERY(CCV(ccv, t_flags)))
+			return;
+		CCV(ccv, snd_ssthresh) = halved;
+		CCV(ccv, snd_cwnd) = halved;
+		ENTER_CONGRECOVERY(CCV(ccv, t_flags));
+	}
+}
+
+/*
+ * Pick the cwnd to continue with as fast recovery concludes.
+ * Does nothing unless the connection is flagged as being in fast recovery.
+ * The ACK being processed is read from the tcpcb's curack field.
+ */
+void
+newreno_post_recovery(struct cc_var *ccv)
+{
+	if (!IN_FASTRECOVERY(CCV(ccv, t_flags)))
+		return;
+
+	/*
+	 * Fast recovery will conclude after returning from this function.
+	 * Window inflation should have left us with approximately snd_ssthresh
+	 * outstanding data. But in case we would be inclined to send a burst,
+	 * better to do it via the slow start mechanism.
+	 */
+	if (SEQ_GT(CCV(ccv, curack) + CCV(ccv, snd_ssthresh),
+	    CCV(ccv, snd_max)))
+		CCV(ccv, snd_cwnd) = CCV(ccv, snd_max) -
+		    CCV(ccv, curack) + CCV(ccv, t_maxseg);
+	else
+		CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
+}
+
+/*
+ * Reset cwnd when a connection that has been idle for a while has more data
+ * ready to be sent.
+ */
+void
+newreno_after_idle(struct cc_var *ccv)
+{
+	u_int mss = CCV(ccv, t_maxseg);
+
+	/*
+	 * No acks are expected to clock out any data we send, so slow start
+	 * to get the ack "clock" running again: min(4 * mss, max(2 * mss,
+	 * 4380)) when tcp_do_rfc3390 is set, two segments otherwise.
+	 */
+	CCV(ccv, snd_cwnd) = V_tcp_do_rfc3390 ?
+	    min(4 * mss, max(2 * mss, 4380)) : mss * 2;
+}
+
+
+DECLARE_CC_MODULE(newreno, &newreno_cc_algo);
diff -r 7159011c25ae -r 2d2f6f743238 sys/netinet/tcp_input.c
--- a/sys/netinet/tcp_input.c	Sun Jul 11 20:33:39 2010 +0000
+++ b/sys/netinet/tcp_input.c	Fri Jul 23 15:02:54 2010 +1000
@@ -40,6 +40,7 @@
 
 #include <sys/param.h>
 #include <sys/kernel.h>
+#include <sys/hhook.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>		/* for proc0 declaration */
@@ -61,6 +62,7 @@
 
 #define TCPSTATES		/* for logging */
 
+#include <netinet/cc.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_systm.h>
@@ -75,7 +77,6 @@
 #include <netinet6/in6_pcb.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/nd6.h>
-#include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
@@ -96,7 +97,7 @@
 
 #include <security/mac/mac_framework.h>
 
-static const int tcprexmtthresh = 3;
+const int tcprexmtthresh = 3;
 
 VNET_DEFINE(struct tcpstat, tcpstat);
 SYSCTL_VNET_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RW,
@@ -132,19 +133,16 @@
     "Enable RFC 3042 (Limited Transmit)");
 
 VNET_DEFINE(int, tcp_do_rfc3390) = 1;
-#define	V_tcp_do_rfc3390	VNET(tcp_do_rfc3390)
 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW,
     &VNET_NAME(tcp_do_rfc3390), 0,
     "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)");
 
 VNET_DEFINE(int, tcp_do_rfc3465) = 1;
-#define	V_tcp_do_rfc3465	VNET(tcp_do_rfc3465)
 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_RW,
     &VNET_NAME(tcp_do_rfc3465), 0,
     "Enable RFC 3465 (Appropriate Byte Counting)");
 
 VNET_DEFINE(int, tcp_abc_l_var) = 2;
-#define	V_tcp_abc_l_var		VNET(tcp_abc_l_var)
 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, abc_l_var, CTLFLAG_RW,
     &VNET_NAME(tcp_abc_l_var), 2,
     "Cap the max cwnd increment during slow-start to this number of segments");
@@ -203,8 +201,9 @@
 		     struct tcphdr *, struct mbuf *, int);
 static void	 tcp_xmit_timer(struct tcpcb *, int);
 static void	 tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *);
-static void inline
-		 tcp_congestion_exp(struct tcpcb *);
+static void inline	cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type);
+static void inline	cc_conn_init(struct tcpcb *tp);
+static void inline	cc_post_recovery(struct tcpcb *tp, struct tcphdr *th);
 
 /*
  * Kernel module interface for updating tcpstat.  The argument is an index
@@ -220,20 +219,180 @@
 	(*((u_long *)&V_tcpstat + statnum))++;
 }
 
+/*
+ * CC wrapper hooks: set up shared state, then call the algorithm hook if set.
+ */
 static void inline
-tcp_congestion_exp(struct tcpcb *tp)
+cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type)
 {
-	u_int win;
-	
-	win = min(tp->snd_wnd, tp->snd_cwnd) /
-	    2 / tp->t_maxseg;
-	if (win < 2)
-		win = 2;
-	tp->snd_ssthresh = win * tp->t_maxseg;
-	ENTER_FASTRECOVERY(tp);
-	tp->snd_recover = tp->snd_max;
-	if (tp->t_flags & TF_ECN_PERMIT)
-		tp->t_flags |= TF_ECN_SND_CWR;
+	INP_WLOCK_ASSERT(tp->t_inpcb);
+
+	tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th);
+	tp->ccv->cwnd_limited = tp->snd_cwnd == min(tp->snd_cwnd,
+	    min(tp->snd_wnd, tp->snd_bwnd));
+
+	if (type == CC_ACK) {	/* ABC: flag each full cwnd of ACKed bytes */
+		tp->t_bytes_acked += tp->ccv->bytes_this_ack;
+		if (tp->t_bytes_acked >= tp->snd_cwnd) {
+			tp->t_bytes_acked -= tp->snd_cwnd;
+			tp->ccv->abc_sentawnd = 1;
+		}
+	}
+
+	if (CC_ALGO(tp)->ack_received != NULL) {
+		/* XXXLAS: Double check that David still needs this */
+		tp->curack = th->th_ack;
+		CC_ALGO(tp)->ack_received(tp->ccv, type);
+	}
+}
+
+/* Seed RTT, ssthresh, bandwidth and cwnd, then run the conn_init hook. */
+static void inline
+cc_conn_init(struct tcpcb *tp)
+{
+	struct hc_metrics_lite metrics;
+	struct inpcb *inp = tp->t_inpcb;
+	struct socket *so = inp->inp_socket;
+	int rtt;
+#ifdef INET6
+	int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
+#endif
+
+	INP_WLOCK_ASSERT(tp->t_inpcb);
+
+	tcp_hc_get(&inp->inp_inc, &metrics);
+
+	if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) {
+		tp->t_srtt = rtt;
+		tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE;
+		TCPSTAT_INC(tcps_usedrtt);
+		if (metrics.rmx_rttvar) {
+			tp->t_rttvar = metrics.rmx_rttvar;
+			TCPSTAT_INC(tcps_usedrttvar);
+		} else {
+			/* default variation is +- 1 rtt */
+			tp->t_rttvar =
+			    tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
+		}
+		TCPT_RANGESET(tp->t_rxtcur,
+			      ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
+			      tp->t_rttmin, TCPTV_REXMTMAX);
+	}
+	if (metrics.rmx_ssthresh) {
+		/*
+		 * There's some sort of gateway or interface buffer limit on
+		 * the path.  Use this to set the slow start threshold, but
+		 * set the threshold to no less than 2*mss.
+		 */
+		tp->snd_ssthresh = max(2 * tp->t_maxseg, metrics.rmx_ssthresh);
+		TCPSTAT_INC(tcps_usedssthresh);
+	}
+	if (metrics.rmx_bandwidth)
+		tp->snd_bandwidth = metrics.rmx_bandwidth;
+
+	/*
+	 * Set the slow-start flight size depending on whether this is a local
+	 * network or not.
+	 *
+	 * Extend this so we cache the cwnd too and retrieve it here. Make cwnd
+	 * even bigger than RFC3390 suggests but only if we have previous
+	 * experience with the remote host. Be careful not to make cwnd bigger
+	 * than remote receive window or our own send socket buffer. Maybe put
+	 * some additional upper bound on the retrieved cwnd. Should do
+	 * incremental updates to hostcache when cwnd collapses so next
+	 * connection doesn't overload the path again.
+	 *
+	 * RFC3390 says only do this if SYN or SYN/ACK didn't get lost. We
+	 * currently check only in syncache_socket for that.
+	 */
+#define TCP_METRICS_CWND
+#ifdef TCP_METRICS_CWND
+	if (metrics.rmx_cwnd)
+		tp->snd_cwnd = max(tp->t_maxseg,
+				min(metrics.rmx_cwnd / 2,
+				 min(tp->snd_wnd, so->so_snd.sb_hiwat)));
+	else	/* binds to the "if" that follows the #endif */
+#endif
+	if (V_tcp_do_rfc3390)
+		tp->snd_cwnd = min(4 * tp->t_maxseg, max(2 * tp->t_maxseg,
+4380));
+#ifdef INET6
+	else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) ||
+		 (!isipv6 && in_localaddr(inp->inp_faddr)))
+#else
+	else if (in_localaddr(inp->inp_faddr))
+#endif
+		tp->snd_cwnd = tp->t_maxseg * V_ss_fltsz_local;
+	else
+		tp->snd_cwnd = tp->t_maxseg * V_ss_fltsz;
+
+	/* Optional per-algorithm hook, run after the values above are set. */
+	if (CC_ALGO(tp)->conn_init != NULL)
+		CC_ALGO(tp)->conn_init(tp->ccv);
+}
+
+/*
+ * Apply the stack's own reaction to congestion signal "type", then hand the
+ * signal to the CC algorithm's cong_signal hook if it has one.
+ *
+ * type: CC_NDUPACK, CC_ECN, CC_RTO or CC_RTO_ERR; any other value only
+ *       reaches the algorithm hook.
+ * th:   segment being processed, or NULL to leave tp->curack as it is.
+ */
+void inline
+cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type)
+{
+	INP_WLOCK_ASSERT(tp->t_inpcb);
+
+	if ((type == CC_NDUPACK && !IN_FASTRECOVERY(tp->t_flags)) ||
+	    (type == CC_ECN && !IN_CONGRECOVERY(tp->t_flags))) {
+		/* Not already recovering from this kind of signal. */
+		if (type == CC_ECN)
+			TCPSTAT_INC(tcps_ecn_rcwnd);
+		tp->snd_recover = tp->snd_max;
+		if (tp->t_flags & TF_ECN_PERMIT)
+			tp->t_flags |= TF_ECN_SND_CWR;
+	} else if (type == CC_RTO) {
+		/* Leave recovery and restart from a one segment cwnd. */
+		tp->t_dupacks = 0;
+		tp->t_bytes_acked = 0;
+		EXIT_RECOVERY(tp->t_flags);
+		tp->snd_cwnd = tp->t_maxseg;
+	} else if (type == CC_RTO_ERR) {
+		TCPSTAT_INC(tcps_sndrexmitbad);
+		/* RTO was unnecessary, so reset everything. */
+		tp->snd_cwnd = tp->snd_cwnd_prev;
+		tp->snd_ssthresh = tp->snd_ssthresh_prev;
+		tp->snd_recover = tp->snd_recover_prev;
+		if (tp->t_flags & TF_WASFRECOVERY)
+			ENTER_FASTRECOVERY(tp->t_flags);
+		if (tp->t_flags & TF_WASCRECOVERY)
+			ENTER_CONGRECOVERY(tp->t_flags);
+		tp->snd_nxt = tp->snd_max;
+		tp->t_badrxtwin = 0;
+	}
+
+	/* The algorithm hook runs after the adjustments made above. */
+	if (CC_ALGO(tp)->cong_signal != NULL) {
+		if (th != NULL)
+			tp->curack = th->th_ack;
+		CC_ALGO(tp)->cong_signal(tp->ccv, type);
+	}
+}
+
+/* Run the CC algorithm's post_recovery hook, if any; reset t_bytes_acked. */
+static void inline
+cc_post_recovery(struct tcpcb *tp, struct tcphdr *th)
+{
+	INP_WLOCK_ASSERT(tp->t_inpcb);
+
+	/* XXXLAS: KASSERT that we're in recovery? */
+	if (CC_ALGO(tp)->post_recovery != NULL) {
+		tp->curack = th->th_ack;
+		CC_ALGO(tp)->post_recovery(tp->ccv);
+	}
+	/* XXXLAS: EXIT_RECOVERY ? */
+	tp->t_bytes_acked = 0;
 }
 
 /* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */
@@ -1076,7 +1235,7 @@
 	int rstreason, todrop, win;
 	u_long tiwin;
 	struct tcpopt to;
-
+	struct tcp_hhook_data hhook_data;
 #ifdef TCPDEBUG
 	/*
 	 * The size of tcp_saveipgen must be the size of the max ip header,
@@ -1157,14 +1316,9 @@
 			TCPSTAT_INC(tcps_ecn_ect1);
 			break;
 		}
-		/*
-		 * Congestion experienced.
-		 * Ignore if we are already trying to recover.
-		 */
-		if ((thflags & TH_ECE) &&
-		    SEQ_LEQ(th->th_ack, tp->snd_recover)) {
-			TCPSTAT_INC(tcps_ecn_rcwnd);
-			tcp_congestion_exp(tp);
+		/* Congestion experienced. */
+		if (thflags & TH_ECE) {
+			cc_cong_signal(tp, th, CC_ECN);
 		}
 	}
 
@@ -1259,15 +1413,9 @@
 		if (tlen == 0) {
 			if (SEQ_GT(th->th_ack, tp->snd_una) &&
 			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
-			    tp->snd_cwnd >= tp->snd_wnd &&
-			    ((!V_tcp_do_newreno &&
-			      !(tp->t_flags & TF_SACK_PERMIT) &&
-			      tp->t_dupacks < tcprexmtthresh) ||
-			     ((V_tcp_do_newreno ||
-			       (tp->t_flags & TF_SACK_PERMIT)) &&
-			      !IN_FASTRECOVERY(tp) &&
-			      (to.to_flags & TOF_SACK) == 0 &&
-			      TAILQ_EMPTY(&tp->snd_holes)))) {
+			    !IN_RECOVERY(tp->t_flags) &&
+			    (to.to_flags & TOF_SACK) == 0 &&
+			    TAILQ_EMPTY(&tp->snd_holes)) {
 				/*
 				 * This is a pure ack for outstanding data.
 				 */
@@ -1287,15 +1435,7 @@
 				 */
 				if (tp->t_rxtshift == 1 &&
 				    (int)(ticks - tp->t_badrxtwin) < 0) {
-					TCPSTAT_INC(tcps_sndrexmitbad);
-					tp->snd_cwnd = tp->snd_cwnd_prev;
-					tp->snd_ssthresh =
-					    tp->snd_ssthresh_prev;
-					tp->snd_recover = tp->snd_recover_prev;
-					if (tp->t_flags & TF_WASFRECOVERY)
-					    ENTER_FASTRECOVERY(tp);
-					tp->snd_nxt = tp->snd_max;
-					tp->t_badrxtwin = 0;
+					cc_cong_signal(tp, th, CC_RTO_ERR);
 				}
 
 				/*
@@ -1322,7 +1462,16 @@
 							ticks - tp->t_rtttime);
 				}
 				tcp_xmit_bandwidth_limit(tp, th->th_ack);
-				acked = th->th_ack - tp->snd_una;
+				acked = BYTES_THIS_ACK(tp, th);
+
+				/* run necessary helper hooks for TCP_ESTABLISHED */
+				hhook_data.new_sacked_bytes = 0;
+				hhook_data.tp = tp;
+				hhook_data.th = th;
+				hhook_data.to = &to;
+				run_hhooks(HHOOK_TYPE_TCP, HHOOK_TCP_ESTABLISHED_IN,
+						   &hhook_data, tp->hdbs);
+
 				TCPSTAT_INC(tcps_rcvackpack);
 				TCPSTAT_ADD(tcps_rcvackbyte, acked);
 				sbdrop(&so->so_snd, acked);
@@ -1359,6 +1508,15 @@
 				else if (!tcp_timer_active(tp, TT_PERSIST))
 					tcp_timer_activate(tp, TT_REXMT,
 						      tp->t_rxtcur);
+				
+				/*
+				 * Let the congestion control algorithm update
+				 * congestion control related information. This
+				 * typically means increasing the congestion
+				 * window.
+				 */
+				cc_ack_received(tp, th, CC_ACK);
+
 				sowwakeup(so);
 				if (so->so_snd.sb_cc)
 					(void) tcp_output(tp);
@@ -1588,6 +1746,7 @@
 				thflags &= ~TH_SYN;
 			} else {
 				tp->t_state = TCPS_ESTABLISHED;
+				cc_conn_init(tp);
 				tcp_timer_activate(tp, TT_KEEP, tcp_keepidle);
 			}
 		} else {
@@ -1991,6 +2150,7 @@
 			tp->t_flags &= ~TF_NEEDFIN;
 		} else {
 			tp->t_state = TCPS_ESTABLISHED;
+			cc_conn_init(tp);
 			tcp_timer_activate(tp, TT_KEEP, tcp_keepidle);
 		}
 		/*
@@ -2021,10 +2181,21 @@
 			TCPSTAT_INC(tcps_rcvacktoomuch);
 			goto dropafterack;
 		}
+		hhook_data.new_sacked_bytes = 0;
 		if ((tp->t_flags & TF_SACK_PERMIT) &&
 		    ((to.to_flags & TOF_SACK) ||
-		     !TAILQ_EMPTY(&tp->snd_holes)))
+		     !TAILQ_EMPTY(&tp->snd_holes))) {
 			tcp_sack_doack(tp, &to, th->th_ack);
+			/* XXXDH: should only be one if a productive SACK */
+			hhook_data.new_sacked_bytes = 1;
+		}
+
+		hhook_data.tp = tp;
+		hhook_data.th = th;
+		hhook_data.to = &to;
+		run_hhooks(HHOOK_TYPE_TCP, HHOOK_TCP_ESTABLISHED_IN,
+		    &hhook_data, tp->hdbs);
+
 		if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
 			if (tlen == 0 && tiwin == tp->snd_wnd) {
 				TCPSTAT_INC(tcps_rcvdupack);
@@ -2059,11 +2230,10 @@
 				    th->th_ack != tp->snd_una)
 					tp->t_dupacks = 0;
 				else if (++tp->t_dupacks > tcprexmtthresh ||
-				    ((V_tcp_do_newreno ||
-				      (tp->t_flags & TF_SACK_PERMIT)) &&
-				     IN_FASTRECOVERY(tp))) {
+				     IN_FASTRECOVERY(tp->t_flags)) {
+					cc_ack_received(tp, th, CC_DUPACK);
 					if ((tp->t_flags & TF_SACK_PERMIT) &&
-					    IN_FASTRECOVERY(tp)) {
+					    IN_FASTRECOVERY(tp->t_flags)) {
 						int awnd;
 						
 						/*
@@ -2085,7 +2255,6 @@
 					goto drop;
 				} else if (tp->t_dupacks == tcprexmtthresh) {
 					tcp_seq onxt = tp->snd_nxt;
-
 					/*
 					 * If we're doing sack, check to
 					 * see if we're already in sack
@@ -2094,19 +2263,20 @@
 					 * recovery.
 					 */
 					if (tp->t_flags & TF_SACK_PERMIT) {
-						if (IN_FASTRECOVERY(tp)) {
+						if (IN_FASTRECOVERY(tp->t_flags)) {
 							tp->t_dupacks = 0;
 							break;
 						}
-					} else if (V_tcp_do_newreno ||
-					    V_tcp_do_ecn) {
+					} else {
 						if (SEQ_LEQ(th->th_ack,
 						    tp->snd_recover)) {
 							tp->t_dupacks = 0;
 							break;
 						}
 					}
-					tcp_congestion_exp(tp);
+					/* congestion signal first, then ack */
+					cc_cong_signal(tp, th, CC_NDUPACK);
+					cc_ack_received(tp, th, CC_DUPACK);
 					tcp_timer_activate(tp, TT_REXMT, 0);
 					tp->t_rtttime = 0;
 					if (tp->t_flags & TF_SACK_PERMIT) {
@@ -2123,6 +2293,7 @@
 					KASSERT(tp->snd_limited <= 2,
 					    ("%s: tp->snd_limited too big",
 					    __func__));
+
 					tp->snd_cwnd = tp->snd_ssthresh +
 					     tp->t_maxseg *
 					     (tp->t_dupacks - tp->snd_limited);
@@ -2130,6 +2301,7 @@
 						tp->snd_nxt = onxt;
 					goto drop;
 				} else if (V_tcp_do_rfc3042) {
+					cc_ack_received(tp, th, CC_DUPACK);
 					u_long oldcwnd = tp->snd_cwnd;
 					tcp_seq oldsndmax = tp->snd_max;
 					u_int sent;
@@ -2171,37 +2343,14 @@
 		 * If the congestion window was inflated to account
 		 * for the other side's cached packets, retract it.
 		 */
-		if (V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) {
-			if (IN_FASTRECOVERY(tp)) {
-				if (SEQ_LT(th->th_ack, tp->snd_recover)) {
-					if (tp->t_flags & TF_SACK_PERMIT)
-						tcp_sack_partialack(tp, th);
-					else
-						tcp_newreno_partial_ack(tp, th);
-				} else {
-					/*
-					 * Out of fast recovery.
-					 * Window inflation should have left us
-					 * with approximately snd_ssthresh
-					 * outstanding data.
-					 * But in case we would be inclined to
-					 * send a burst, better to do it via
-					 * the slow start mechanism.
-					 */
-					if (SEQ_GT(th->th_ack +
-							tp->snd_ssthresh,
-						   tp->snd_max))
-						tp->snd_cwnd = tp->snd_max -
-								th->th_ack +
-								tp->t_maxseg;
-					else
-						tp->snd_cwnd = tp->snd_ssthresh;
-				}
-			}
-		} else {
-			if (tp->t_dupacks >= tcprexmtthresh &&
-			    tp->snd_cwnd > tp->snd_ssthresh)
-				tp->snd_cwnd = tp->snd_ssthresh;
+		if (IN_FASTRECOVERY(tp->t_flags)) {
+			if (SEQ_LT(th->th_ack, tp->snd_recover)) {
+				if (tp->t_flags & TF_SACK_PERMIT)
+					tcp_sack_partialack(tp, th);
+				else
+					tcp_newreno_partial_ack(tp, th);
+			} else
+				cc_post_recovery(tp, th);
 		}
 		tp->t_dupacks = 0;
 		/*
@@ -2232,7 +2381,7 @@
 		    ("tcp_input: process_ACK ti_locked %d", ti_locked));
 		INP_WLOCK_ASSERT(tp->t_inpcb);
 
-		acked = th->th_ack - tp->snd_una;
+		acked = BYTES_THIS_ACK(tp, th);
 		TCPSTAT_INC(tcps_rcvackpack);
 		TCPSTAT_ADD(tcps_rcvackbyte, acked);
 
@@ -2243,16 +2392,8 @@
 		 * original cwnd and ssthresh, and proceed to transmit where
 		 * we left off.
 		 */
-		if (tp->t_rxtshift == 1 && (int)(ticks - tp->t_badrxtwin) < 0) {
-			TCPSTAT_INC(tcps_sndrexmitbad);
-			tp->snd_cwnd = tp->snd_cwnd_prev;
-			tp->snd_ssthresh = tp->snd_ssthresh_prev;
-			tp->snd_recover = tp->snd_recover_prev;
-			if (tp->t_flags & TF_WASFRECOVERY)
-				ENTER_FASTRECOVERY(tp);
-			tp->snd_nxt = tp->snd_max;
-			tp->t_badrxtwin = 0;	/* XXX probably not required */
-		}
+		if (tp->t_rxtshift == 1 && (int)(ticks - tp->t_badrxtwin) < 0)
+			cc_cong_signal(tp, th, CC_RTO_ERR);
 
 		/*
 		 * If we have a timestamp reply, update smoothed
@@ -2300,61 +2441,12 @@
 			goto step6;
 
 		/*
-		 * When new data is acked, open the congestion window.
-		 * Method depends on which congestion control state we're
-		 * in (slow start or cong avoid) and if ABC (RFC 3465) is
-		 * enabled.
-		 *
-		 * slow start: cwnd <= ssthresh
-		 * cong avoid: cwnd > ssthresh
-		 *
-		 * slow start and ABC (RFC 3465):
-		 *   Grow cwnd exponentially by the amount of data
-		 *   ACKed capping the max increment per ACK to
-		 *   (abc_l_var * maxseg) bytes.
-		 *
-		 * slow start without ABC (RFC 2581):
-		 *   Grow cwnd exponentially by maxseg per ACK.
-		 *
-		 * cong avoid and ABC (RFC 3465):
-		 *   Grow cwnd linearly by maxseg per RTT for each
-		 *   cwnd worth of ACKed data.
-		 *
-		 * cong avoid without ABC (RFC 2581):
-		 *   Grow cwnd linearly by approximately maxseg per RTT using
-		 *   maxseg^2 / cwnd per ACK as the increment.
-		 *   If cwnd > maxseg^2, fix the cwnd increment at 1 byte to
-		 *   avoid capping cwnd.
+		 * Let the congestion control algorithm update congestion
+		 * control related information. This typically means increasing
+		 * the congestion window.
 		 */
-		if ((!V_tcp_do_newreno && !(tp->t_flags & TF_SACK_PERMIT)) ||
-		    !IN_FASTRECOVERY(tp)) {
-			u_int cw = tp->snd_cwnd;
-			u_int incr = tp->t_maxseg;
-			/* In congestion avoidance? */
-			if (cw > tp->snd_ssthresh) {
-				if (V_tcp_do_rfc3465) {
-					tp->t_bytes_acked += acked;
-					if (tp->t_bytes_acked >= tp->snd_cwnd)
-						tp->t_bytes_acked -= cw;
-					else
-						incr = 0;
-				}
-				else
-					incr = max((incr * incr / cw), 1);
-			/*
-			 * In slow-start with ABC enabled and no RTO in sight?
-			 * (Must not use abc_l_var > 1 if slow starting after an
-			 * RTO. On RTO, snd_nxt = snd_una, so the snd_nxt ==
-			 * snd_max check is sufficient to handle this).
-			 */
-			} else if (V_tcp_do_rfc3465 &&
-			    tp->snd_nxt == tp->snd_max)
-				incr = min(acked,
-				    V_tcp_abc_l_var * tp->t_maxseg);
-			/* ABC is on by default, so (incr == 0) frequently. */
-			if (incr > 0)
-				tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<<tp->snd_scale);
-		}
+		cc_ack_received(tp, th, CC_ACK);
+
 		SOCKBUF_LOCK(&so->so_snd);
 		if (acked > so->so_snd.sb_cc) {
 			tp->snd_wnd -= so->so_snd.sb_cc;
@@ -2368,16 +2460,14 @@
 		/* NB: sowwakeup_locked() does an implicit unlock. */
 		sowwakeup_locked(so);
 		/* Detect una wraparound. */
-		if ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) &&
-		    !IN_FASTRECOVERY(tp) &&
+		if (!IN_RECOVERY(tp->t_flags) &&
 		    SEQ_GT(tp->snd_una, tp->snd_recover) &&
 		    SEQ_LEQ(th->th_ack, tp->snd_recover))
 			tp->snd_recover = th->th_ack - 1;
-		if ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) &&
-		    IN_FASTRECOVERY(tp) &&
+		/* XXXLAS: Can this be moved up into cc_post_recovery? */
+		if (IN_RECOVERY(tp->t_flags) &&
 		    SEQ_GEQ(th->th_ack, tp->snd_recover)) {
-			EXIT_FASTRECOVERY(tp);
-			tp->t_bytes_acked = 0;
+			EXIT_RECOVERY(tp->t_flags);
 		}
 		tp->snd_una = th->th_ack;
 		if (tp->t_flags & TF_SACK_PERMIT) {
@@ -3242,24 +3332,19 @@
 void
 tcp_mss(struct tcpcb *tp, int offer)
 {
-	int rtt, mss;
+	int mss;
 	u_long bufsize;
 	struct inpcb *inp;
 	struct socket *so;
 	struct hc_metrics_lite metrics;
 	int mtuflags = 0;
-#ifdef INET6
-	int isipv6;
-#endif
+
 	KASSERT(tp != NULL, ("%s: tp == NULL", __func__));
 	
 	tcp_mss_update(tp, offer, &metrics, &mtuflags);
 
 	mss = tp->t_maxseg;
 	inp = tp->t_inpcb;
-#ifdef INET6
-	isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
-#endif
 
 	/*
 	 * If there's a pipesize, change the socket buffer to that size,
@@ -3299,73 +3384,6 @@
 			(void)sbreserve_locked(&so->so_rcv, bufsize, so, NULL);
 	}
 	SOCKBUF_UNLOCK(&so->so_rcv);
-	/*
-	 * While we're here, check the others too.
-	 */
-	if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) {
-		tp->t_srtt = rtt;
-		tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE;
-		TCPSTAT_INC(tcps_usedrtt);
-		if (metrics.rmx_rttvar) {
-			tp->t_rttvar = metrics.rmx_rttvar;
-			TCPSTAT_INC(tcps_usedrttvar);
-		} else {
-			/* default variation is +- 1 rtt */
-			tp->t_rttvar =
-			    tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
-		}
-		TCPT_RANGESET(tp->t_rxtcur,
-			      ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
-			      tp->t_rttmin, TCPTV_REXMTMAX);
-	}
-	if (metrics.rmx_ssthresh) {
-		/*
-		 * There's some sort of gateway or interface
-		 * buffer limit on the path.  Use this to set
-		 * the slow start threshhold, but set the
-		 * threshold to no less than 2*mss.
-		 */
-		tp->snd_ssthresh = max(2 * mss, metrics.rmx_ssthresh);
-		TCPSTAT_INC(tcps_usedssthresh);
-	}
-	if (metrics.rmx_bandwidth)
-		tp->snd_bandwidth = metrics.rmx_bandwidth;
-
-	/*
-	 * Set the slow-start flight size depending on whether this
-	 * is a local network or not.
-	 *
-	 * Extend this so we cache the cwnd too and retrieve it here.
-	 * Make cwnd even bigger than RFC3390 suggests but only if we
-	 * have previous experience with the remote host. Be careful
-	 * not make cwnd bigger than remote receive window or our own
-	 * send socket buffer. Maybe put some additional upper bound
-	 * on the retrieved cwnd. Should do incremental updates to
-	 * hostcache when cwnd collapses so next connection doesn't
-	 * overloads the path again.
-	 *
-	 * RFC3390 says only do this if SYN or SYN/ACK didn't got lost.
-	 * We currently check only in syncache_socket for that.
-	 */
-#define TCP_METRICS_CWND
-#ifdef TCP_METRICS_CWND
-	if (metrics.rmx_cwnd)
-		tp->snd_cwnd = max(mss,
-				min(metrics.rmx_cwnd / 2,
-				 min(tp->snd_wnd, so->so_snd.sb_hiwat)));
-	else
-#endif
-	if (V_tcp_do_rfc3390)
-		tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380));
-#ifdef INET6
-	else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) ||
-		 (!isipv6 && in_localaddr(inp->inp_faddr)))
-#else
-	else if (in_localaddr(inp->inp_faddr))
-#endif
-		tp->snd_cwnd = mss * V_ss_fltsz_local;
-	else
-		tp->snd_cwnd = mss * V_ss_fltsz;
 
 	/* Check the interface for TSO capabilities. */
 	if (mtuflags & CSUM_TSO)
@@ -3429,7 +3447,7 @@
 	 * Set snd_cwnd to one segment beyond acknowledged offset.
 	 * (tp->snd_una has not yet been updated when this function is called.)
 	 */
-	tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una);
+	tp->snd_cwnd = tp->t_maxseg + BYTES_THIS_ACK(tp, th);
 	tp->t_flags |= TF_ACKNOW;
 	(void) tcp_output(tp);
 	tp->snd_cwnd = ocwnd;
@@ -3439,8 +3457,8 @@
 	 * Partial window deflation.  Relies on fact that tp->snd_una
 	 * not updated yet.
 	 */
-	if (tp->snd_cwnd > th->th_ack - tp->snd_una)
-		tp->snd_cwnd -= th->th_ack - tp->snd_una;
+	if (tp->snd_cwnd > BYTES_THIS_ACK(tp, th))
+		tp->snd_cwnd -= BYTES_THIS_ACK(tp, th);
 	else
 		tp->snd_cwnd = 0;
 	tp->snd_cwnd += tp->t_maxseg;
diff -r 7159011c25ae -r 2d2f6f743238 sys/netinet/tcp_output.c
--- a/sys/netinet/tcp_output.c	Sun Jul 11 20:33:39 2010 +0000
+++ b/sys/netinet/tcp_output.c	Fri Jul 23 15:02:54 2010 +1000
@@ -40,6 +40,7 @@
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/domain.h>
+#include <sys/hhook.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mbuf.h>
@@ -53,6 +54,7 @@
 #include <net/route.h>
 #include <net/vnet.h>
 
+#include <netinet/cc.h>
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
@@ -64,7 +66,6 @@
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #endif
-#include <netinet/tcp.h>
 #define	TCPOUTFLAGS
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
@@ -102,11 +103,6 @@
 	CTLFLAG_RW, &VNET_NAME(ss_fltsz_local), 1,
 	"Slow start flight size for local networks");
 
-VNET_DEFINE(int, tcp_do_newreno) = 1;
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW,
-	&VNET_NAME(tcp_do_newreno), 0,
-	"Enable NewReno Algorithms");
-
 VNET_DEFINE(int, tcp_do_tso) = 1;
 #define	V_tcp_do_tso		VNET(tcp_do_tso)
 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW,
@@ -131,6 +127,19 @@
 	&VNET_NAME(tcp_autosndbuf_max), 0,
 	"Max size of automatic send buffer");
 
+static inline void	cc_after_idle(struct tcpcb *tp);
+
+/*
+ * CC wrapper hook: run the connection's congestion control after_idle
+ * callback, if it provides one.  The inpcb write lock must be held.
+ */
+static inline void
+cc_after_idle(struct tcpcb *tp)
+{
+	INP_WLOCK_ASSERT(tp->t_inpcb);
+	if (CC_ALGO(tp)->after_idle != NULL)
+		CC_ALGO(tp)->after_idle(tp->ccv);
+}
 
 /*
  * Tcp output routine: figure out what should be sent and send it.
@@ -155,6 +164,7 @@
 	struct sackhole *p;
 	int tso = 0;
 	struct tcpopt to;
+	struct tcp_hhook_data hhook_data;
 #if 0
 	int maxburst = TCP_MAXBURST;
 #endif
@@ -174,26 +184,8 @@
 	 * to send, then transmit; otherwise, investigate further.
 	 */
 	idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
-	if (idle && ticks - tp->t_rcvtime >= tp->t_rxtcur) {
-		/*
-		 * We have been idle for "a while" and no acks are
-		 * expected to clock out any data we send --
-		 * slow start to get ack "clock" running again.
-		 *
-		 * Set the slow-start flight size depending on whether
-		 * this is a local network or not.
-		 */
-		int ss = V_ss_fltsz;
-#ifdef INET6
-		if (isipv6) {
-			if (in6_localaddr(&tp->t_inpcb->in6p_faddr))
-				ss = V_ss_fltsz_local;
-		} else
-#endif /* INET6 */
-		if (in_localaddr(tp->t_inpcb->inp_faddr))
-			ss = V_ss_fltsz_local;
-		tp->snd_cwnd = tp->t_maxseg * ss;
-	}
+	if (idle && ticks - tp->t_rcvtime >= tp->t_rxtcur)
+		cc_after_idle(tp);
 	tp->t_flags &= ~TF_LASTIDLE;
 	if (idle) {
 		if (tp->t_flags & TF_MORETOCOME) {
@@ -230,7 +222,7 @@
 	sack_bytes_rxmt = 0;
 	len = 0;
 	p = NULL;
-	if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp) &&
+	if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp->t_flags) &&
 	    (p = tcp_sack_output(tp, &sack_bytes_rxmt))) {
 		long cwin;
 		
@@ -1124,6 +1116,15 @@
 			tp->snd_max = tp->snd_nxt + len;
 	}
 
+	hhook_data.th = th;
+	hhook_data.tp = tp;
+	hhook_data.to = &to;
+	hhook_data.len = len;
+	hhook_data.tso = tso;
+	run_hhooks(HHOOK_TYPE_TCP, HHOOK_TCP_ESTABLISHED_OUT, &hhook_data,
+	    tp->hdbs);
+
+
 #ifdef TCPDEBUG
 	/*
 	 * Trace.
diff -r 7159011c25ae -r 2d2f6f743238 sys/netinet/tcp_sack.c
--- a/sys/netinet/tcp_sack.c	Sun Jul 11 20:33:39 2010 +0000
+++ b/sys/netinet/tcp_sack.c	Fri Jul 23 15:02:54 2010 +1000
@@ -425,6 +425,7 @@
 	 * are received.
 	 */
 	sblkp = &sack_blocks[num_sack_blks - 1];	/* Last SACK block */
+	tp->sackhint.last_sack_ack = sblkp->end;
 	if (SEQ_LT(tp->snd_fack, sblkp->start)) {
 		/*
 		 * The highest SACK block is beyond fack.  Append new SACK
@@ -576,7 +577,7 @@
 	tcp_timer_activate(tp, TT_REXMT, 0);
 	tp->t_rtttime = 0;
 	/* Send one or 2 segments based on how much new data was acked. */
-	if (((th->th_ack - tp->snd_una) / tp->t_maxseg) > 2)
+	if ((BYTES_THIS_ACK(tp, th) / tp->t_maxseg) > 2)
 		num_segs = 2;
 	tp->snd_cwnd = (tp->sackhint.sack_bytes_rexmit +
 	    (tp->snd_nxt - tp->sack_newdata) + num_segs * tp->t_maxseg);
diff -r 7159011c25ae -r 2d2f6f743238 sys/netinet/tcp_subr.c
--- a/sys/netinet/tcp_subr.c	Sun Jul 11 20:33:39 2010 +0000
+++ b/sys/netinet/tcp_subr.c	Fri Jul 23 15:02:54 2010 +1000
@@ -41,7 +41,9 @@
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/callout.h>
+#include <sys/hhook.h>
 #include <sys/kernel.h>
+#include <sys/khelp.h>
 #include <sys/sysctl.h>
 #include <sys/jail.h>
 #include <sys/malloc.h>
@@ -62,6 +64,7 @@
 #include <net/if.h>
 #include <net/vnet.h>
 
+#include <netinet/cc.h>
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
@@ -80,7 +83,6 @@
 #include <netinet6/nd6.h>
 #endif
 #include <netinet/ip_icmp.h>
-#include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
@@ -288,6 +290,8 @@
 struct tcpcb_mem {
 	struct	tcpcb		tcb;
 	struct	tcp_timer	tt;
+	struct cc_var		ccv;
+	struct	helper_dblocks	hdbs;
 };
 
 static VNET_DEFINE(uma_zone_t, tcpcb_zone);
@@ -327,6 +331,15 @@
 {
 	int hashsize;
 
+	if (register_hhook_head(HHOOK_TYPE_TCP, HHOOK_TCP_ESTABLISHED_IN,
+	    HHOOK_NOWAIT) != 0)
+		printf("%s: WARNING: unable to register helper hook\n", __func__);
+	if (register_hhook_head(HHOOK_TYPE_TCP, HHOOK_TCP_ESTABLISHED_OUT,
+	    HHOOK_NOWAIT) != 0)
+		printf("%s: WARNING: unable to register helper hook\n", __func__);
+
+	cc_init();
+
 	hashsize = TCBHASHSIZE;
 	TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize);
 	if (!powerof2(hashsize)) {
@@ -692,6 +705,33 @@
 	if (tm == NULL)
 		return (NULL);
 	tp = &tm->tcb;
+
+	/* Initialise cc_var struct for this tcpcb. */
+	tp->ccv = &tm->ccv;
+	tp->ccv->type = IPPROTO_TCP;
+	tp->ccv->ccvc.tcp = tp;
+
+	/* Use the current system default CC algorithm. */
+	CC_LIST_RLOCK();
+	KASSERT(!STAILQ_EMPTY(&cc_list), ("cc_list is empty!"));
+	CC_ALGO(tp) = CC_DEFAULT();
+	CC_LIST_RUNLOCK();
+	if (CC_ALGO(tp)->cb_init != NULL)
+		if (CC_ALGO(tp)->cb_init(tp->ccv) > 0) {
+			uma_zfree(V_tcpcb_zone, tm);
+			return (NULL);
+		}
+
+	tp->hdbs = &tm->hdbs;
+	tp->hdbs->class = HELPER_CLASS_TCP;
+	if (init_helper_dblocks(tp->hdbs)) {
+		/* Undo cb_init() so CC algorithm state is not leaked. */
+		if (CC_ALGO(tp)->cb_destroy != NULL)
+			CC_ALGO(tp)->cb_destroy(tp->ccv);
+		uma_zfree(V_tcpcb_zone, tm);
+		return (NULL);
+	}
+
 #ifdef VIMAGE
 	tp->t_vnet = inp->inp_vnet;
 #endif
@@ -866,8 +906,15 @@
 	}
 	/* Disconnect offload device, if any. */
 	tcp_offload_detach(tp);
-		
 	tcp_free_sackholes(tp);
+
+	/* Allow the CC algorithm to clean up after itself. */
+	if (CC_ALGO(tp)->cb_destroy != NULL)
+		CC_ALGO(tp)->cb_destroy(tp->ccv);
+
+	destroy_helper_dblocks(tp->hdbs);
+
+	CC_ALGO(tp) = NULL;
 	inp->inp_ppcb = NULL;
 	tp->t_inpcb = NULL;
 	uma_zfree(V_tcpcb_zone, tp);
@@ -1646,7 +1693,7 @@
 	tcp_free_sackholes(tp);
 	tp->snd_recover = tp->snd_max;
 	if (tp->t_flags & TF_SACK_PERMIT)
-		EXIT_FASTRECOVERY(tp);
+		EXIT_FASTRECOVERY(tp->t_flags);
 	tcp_output_send(tp);
 	return (inp);
 }
diff -r 7159011c25ae -r 2d2f6f743238 sys/netinet/tcp_timer.c
--- a/sys/netinet/tcp_timer.c	Sun Jul 11 20:33:39 2010 +0000
+++ b/sys/netinet/tcp_timer.c	Fri Jul 23 15:02:54 2010 +1000
@@ -51,6 +51,7 @@
 #include <net/route.h>
 #include <net/vnet.h>
 
+#include <netinet/cc.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_systm.h>
@@ -58,7 +59,6 @@
 #include <netinet6/in6_pcb.h>
 #endif
 #include <netinet/ip_var.h>
-#include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
@@ -515,10 +515,14 @@
 		tp->snd_cwnd_prev = tp->snd_cwnd;
 		tp->snd_ssthresh_prev = tp->snd_ssthresh;
 		tp->snd_recover_prev = tp->snd_recover;
-		if (IN_FASTRECOVERY(tp))
+		if (IN_FASTRECOVERY(tp->t_flags))
 		  tp->t_flags |= TF_WASFRECOVERY;
 		else
 		  tp->t_flags &= ~TF_WASFRECOVERY;
+		if (IN_CONGRECOVERY(tp->t_flags))
+		  tp->t_flags |= TF_WASCRECOVERY;
+		else
+		  tp->t_flags &= ~TF_WASCRECOVERY;
 		tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
 	}
 	TCPSTAT_INC(tcps_rexmttimeo);
@@ -562,40 +566,9 @@
 	 * If timing a segment in this window, stop the timer.
 	 */
 	tp->t_rtttime = 0;
-	/*
-	 * Close the congestion window down to one segment
-	 * (we'll open it by one segment for each ack we get).
-	 * Since we probably have a window's worth of unacked
-	 * data accumulated, this "slow start" keeps us from
-	 * dumping all that data as back-to-back packets (which
-	 * might overwhelm an intermediate gateway).
-	 *
-	 * There are two phases to the opening: Initially we
-	 * open by one mss on each ack.  This makes the window
-	 * size increase exponentially with time.  If the
-	 * window is larger than the path can handle, this
-	 * exponential growth results in dropped packet(s)
-	 * almost immediately.  To get more time between
-	 * drops but still "push" the network to take advantage
-	 * of improving conditions, we switch from exponential
-	 * to linear window opening at some threshhold size.
-	 * For a threshhold, we use half the current window
-	 * size, truncated to a multiple of the mss.
-	 *
-	 * (the minimum cwnd that will give us exponential
-	 * growth is 2 mss.  We don't allow the threshhold
-	 * to go below this.)
-	 */
-	{
-		u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;
-		if (win < 2)
-			win = 2;
-		tp->snd_cwnd = tp->t_maxseg;
-		tp->snd_ssthresh = win * tp->t_maxseg;
-		tp->t_dupacks = 0;
-	}
-	EXIT_FASTRECOVERY(tp);
-	tp->t_bytes_acked = 0;
+
+	cc_cong_signal(tp, 0, CC_RTO);
+
 	(void) tcp_output(tp);
 
 out:
diff -r 7159011c25ae -r 2d2f6f743238 sys/netinet/tcp_usrreq.c
--- a/sys/netinet/tcp_usrreq.c	Sun Jul 11 20:33:39 2010 +0000
+++ b/sys/netinet/tcp_usrreq.c	Fri Jul 23 15:02:54 2010 +1000
@@ -62,6 +62,7 @@
 #include <net/route.h>
 #include <net/vnet.h>
 
+#include <netinet/cc.h>
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #ifdef INET6
@@ -77,7 +78,6 @@
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 #endif
-#include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
@@ -1244,6 +1244,8 @@
 	struct	inpcb *inp;
 	struct	tcpcb *tp;
 	struct	tcp_info ti;
+	char buf[TCP_CA_NAME_MAX];
+	struct cc_algo *algo;
 
 	error = 0;
 	inp = sotoinpcb(so);
@@ -1353,6 +1355,54 @@
 			error = EINVAL;
 			break;
 
+		case TCP_CONGESTION:
+			INP_WUNLOCK(inp);
+			bzero(buf, sizeof(buf));
+			error = sooptcopyin(sopt, &buf, sizeof(buf), 1);
+			if (error)
+				break;
+			INP_WLOCK_RECHECK(inp);
+			/*
+			 * Return EINVAL if we can't find the requested cc algo.
+			 */
+			error = EINVAL;
+			CC_LIST_RLOCK();
+			STAILQ_FOREACH(algo, &cc_list, entries) {
+				if (strncmp(buf, algo->name, TCP_CA_NAME_MAX)
+				    == 0) {
+					/* We've found the requested algo. */
+					error = 0;
+					/*
+					 * We hold a write lock over the tcb
+					 * so it's safe to do these things
+					 * without ordering concerns.
+					 */
+					if (CC_ALGO(tp)->cb_destroy != NULL)
+						CC_ALGO(tp)->cb_destroy(tp->ccv);
+					CC_ALGO(tp) = algo;
+					/*
+					 * If something goes pear shaped
+					 * initialising the new algo,
+					 * fall back to newreno (which
+					 * does not require initialisation).
+					 */
+					if (algo->cb_init != NULL)
+						if (algo->cb_init(tp->ccv) > 0) {
+							CC_ALGO(tp) = &newreno_cc_algo;
+							/*
+							 * The only reason init
+							 * should fail is
+							 * because of malloc.
+							 */
+							error = ENOMEM;
+						}
+					break; /* Break the STAILQ_FOREACH. */
+				}
+			}
+			CC_LIST_RUNLOCK();
+			INP_WUNLOCK(inp);
+			break;
+
 		default:
 			INP_WUNLOCK(inp);
 			error = ENOPROTOOPT;
@@ -1396,6 +1446,12 @@
 			INP_WUNLOCK(inp);
 			error = sooptcopyout(sopt, &ti, sizeof ti);
 			break;
+		case TCP_CONGESTION:
+			bzero(buf, sizeof(buf));
+			strlcpy(buf, CC_ALGO(tp)->name, TCP_CA_NAME_MAX);
+			INP_WUNLOCK(inp);
+			error = sooptcopyout(sopt, buf, TCP_CA_NAME_MAX);
+			break;
 		default:
 			INP_WUNLOCK(inp);
 			error = ENOPROTOOPT;
@@ -1709,6 +1765,10 @@
 		db_printf("%sTF_FASTRECOVERY", comma ? ", " : "");
 		comma = 1;
 	}
+	if (t_flags & TF_CONGRECOVERY) {
+		db_printf("%sTF_CONGRECOVERY", comma ? ", " : "");
+		comma = 1;
+	}
 	if (t_flags & TF_WASFRECOVERY) {
 		db_printf("%sTF_WASFRECOVERY", comma ? ", " : "");
 		comma = 1;
diff -r 7159011c25ae -r 2d2f6f743238 sys/netinet/tcp_var.h
--- a/sys/netinet/tcp_var.h	Sun Jul 11 20:33:39 2010 +0000
+++ b/sys/netinet/tcp_var.h	Fri Jul 23 15:02:54 2010 +1000
@@ -43,6 +43,12 @@
  */
 VNET_DECLARE(int, tcp_do_rfc1323);
 #define	V_tcp_do_rfc1323	VNET(tcp_do_rfc1323)
+VNET_DECLARE(int, tcp_do_rfc3465);
+#define	V_tcp_do_rfc3465	VNET(tcp_do_rfc3465)
+VNET_DECLARE(int, tcp_abc_l_var);
+#define	V_tcp_abc_l_var		VNET(tcp_abc_l_var)
+VNET_DECLARE(int, tcp_do_rfc3390);
+#define	V_tcp_do_rfc3390	VNET(tcp_do_rfc3390)
 
 VNET_DECLARE(int, tcp_reass_qsize);
 VNET_DECLARE(struct uma_zone *, tcp_reass_zone);
@@ -74,7 +80,7 @@
 struct sackhint {
 	struct sackhole	*nexthole;
 	int		sack_bytes_rexmit;
-
+        tcp_seq		last_sack_ack; /* End of the last SACK block in the current ACK; used for enhanced RTT calculations. */
 	int		ispare;		/* explicit pad for 64bit alignment */
 	uint64_t	_pad[2];	/* 1 sacked_bytes, 1 TBD */
 };
@@ -122,6 +128,7 @@
 					 */
 	tcp_seq	snd_nxt;		/* send next */
 	tcp_seq	snd_up;			/* send urgent pointer */
+	tcp_seq	curack;			/* Most recent ACK */
 
 	tcp_seq	snd_wl1;		/* window update seg seq number */
 	tcp_seq	snd_wl2;		/* window update seg ack number */
@@ -199,10 +206,12 @@
 	struct toe_usrreqs *t_tu;	/* offload operations vector */
 	void	*t_toe;			/* TOE pcb pointer */
 	int	t_bytes_acked;		/* # bytes acked during current RTT */
-
 	int	t_ispare;		/* explicit pad for 64bit alignment */
 	void	*t_pspare2[6];		/* 2 CC / 4 TBD */
 	uint64_t _pad[12];		/* 7 UTO, 5 TBD (1-2 CC/RTT?) */
+	struct cc_var	*ccv;
+	struct cc_algo	*cc_algo;	/* the algorithm that will manage congestion control*/
+	struct helper_dblocks	*hdbs;
 };
 
 /*
@@ -234,10 +243,38 @@
 #define	TF_ECN_PERMIT	0x4000000	/* connection ECN-ready */
 #define	TF_ECN_SND_CWR	0x8000000	/* ECN CWR in queue */
 #define	TF_ECN_SND_ECE	0x10000000	/* ECN ECE in queue */
+#define	TF_CONGRECOVERY	0x20000000	/* Congestion recovery mode */
+#define	TF_WASCRECOVERY	0x40000000	/* was in congestion recovery */
 
-#define IN_FASTRECOVERY(tp)	(tp->t_flags & TF_FASTRECOVERY)
-#define ENTER_FASTRECOVERY(tp)	tp->t_flags |= TF_FASTRECOVERY
-#define EXIT_FASTRECOVERY(tp)	tp->t_flags &= ~TF_FASTRECOVERY
+/* Test, set and clear the recovery bits of a t_flags lvalue. */
+#define	IN_FASTRECOVERY(t_flags)	((t_flags) & TF_FASTRECOVERY)
+#define	ENTER_FASTRECOVERY(t_flags)	((t_flags) |= TF_FASTRECOVERY)
+#define	EXIT_FASTRECOVERY(t_flags)	((t_flags) &= ~TF_FASTRECOVERY)
+#define	IN_CONGRECOVERY(t_flags)	((t_flags) & TF_CONGRECOVERY)
+#define	ENTER_CONGRECOVERY(t_flags)	((t_flags) |= TF_CONGRECOVERY)
+#define	EXIT_CONGRECOVERY(t_flags)	((t_flags) &= ~TF_CONGRECOVERY)
+#define	IN_RECOVERY(t_flags) ((t_flags) & (TF_CONGRECOVERY | TF_FASTRECOVERY))
+#define	ENTER_RECOVERY(t_flags) ((t_flags) |= (TF_CONGRECOVERY | TF_FASTRECOVERY))
+#define	EXIT_RECOVERY(t_flags) ((t_flags) &= ~(TF_CONGRECOVERY | TF_FASTRECOVERY))
+
+/* Number of bytes th acknowledges beyond tp->snd_una. */
+#define	BYTES_THIS_ACK(tp, th)	((th)->th_ack - (tp)->snd_una)
+
+/*
+ * TCP specific helper hook point identifiers.
+ */
+#define	HHOOK_TCP_ESTABLISHED_IN	1
+#define	HHOOK_TCP_ESTABLISHED_OUT	2
+
+struct tcp_hhook_data {
+	struct tcpcb *tp;	/* control block passed by the hook caller */
+	struct tcphdr *th;	/* TCP header passed by the hook caller */
+	struct tcpopt *to;	/* TCP options passed by the hook caller */
+	long len;		/* caller's "len" for the segment */
+	int tso;		/* caller's "tso" flag */
+	tcp_seq  curack;	/* NOTE(review): not set on output path */
+	int new_sacked_bytes;	/* NOTE(review): not set on output path */
+};
 
 /*
  * Flags for the t_oobflags field.
@@ -678,6 +715,8 @@
 int	 tcp_newreno(struct tcpcb *, struct tcphdr *);
 u_long	 tcp_seq_subtract(u_long, u_long );
 
+void	cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type);
+
 #endif /* _KERNEL */
 
 #endif /* _NETINET_TCP_VAR_H_ */
diff -r 7159011c25ae -r 2d2f6f743238 sys/sys/hhook.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sys/sys/hhook.h	Fri Jul 23 15:02:54 2010 +1000
@@ -0,0 +1,69 @@
+/*-
+ * Copyright (c) 2010 Lawrence Stewart <lstewart@freebsd.org>
+ * All rights reserved.
+ *
+ * This software was developed at the Centre for Advanced Internet
+ * Architectures, Swinburne University, by Lawrence Stewart,
+ * made possible in part by a grant from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SYS_HHOOK_H_
+#define _SYS_HHOOK_H_
+
+#define	HHOOK_WAITOK	0x01
+#define	HHOOK_NOWAIT	0x02
+
+#define	HHOOK_TYPE_TCP		1
+
+struct helper;
+struct helper_dblocks;
+struct hhook_head;
+
+typedef void (*hhook_func_t)(void *udata, void *ctx_data, void *helper_dblock,
+    struct helper_dblocks *hdbs);
+
+int	register_hhook_head(int hhook_type, int hhook_id, int flags);
+int	deregister_hhook_head(int hhook_type, int hhook_id);
+int	register_hhook(int hhook_type, int hhook_id, struct helper *helper,
+    hhook_func_t hook, void *udata, int flags);
+int	deregister_hhook(int hhook_type, int hhook_id, hhook_func_t hook,
+    void *udata, int flags);
+void	run_hhooks(int hhook_type, int hhook_id, void *ctx_data,
+    struct helper_dblocks *hdbs);
+
+#define	HHOOK_HEAD_LIST_LOCK() mtx_lock(&hhook_head_list_lock)
+#define	HHOOK_HEAD_LIST_UNLOCK() mtx_unlock(&hhook_head_list_lock)
+#define	HHOOK_HEAD_LIST_LOCK_ASSERT() mtx_assert(&hhook_head_list_lock, MA_OWNED)
+
+#define	HHOOK_HEAD_LOCK_INIT(hh) rm_init(&(hh)->hh_lock, "hhook_head rm lock")
+#define	HHOOK_HEAD_LOCK_DESTROY(hh) rm_destroy(&(hh)->hh_lock)
+#define	HHOOK_HEAD_WLOCK(hh) rm_wlock(&(hh)->hh_lock)
+#define	HHOOK_HEAD_WUNLOCK(hh) rm_wunlock(&(hh)->hh_lock)
+#define	HHOOK_HEAD_RLOCK(hh,rmpt) rm_rlock(&(hh)->hh_lock, (rmpt))
+#define	HHOOK_HEAD_RUNLOCK(hh,rmpt) rm_runlock(&(hh)->hh_lock, (rmpt))
+
+#endif /* _SYS_HHOOK_H_ */
+
diff -r 7159011c25ae -r 2d2f6f743238 sys/sys/khelp.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sys/sys/khelp.h	Fri Jul 23 15:02:54 2010 +1000
@@ -0,0 +1,82 @@
+/*-
+ * Copyright (c) 2010 Lawrence Stewart <lstewart@freebsd.org>
+ * All rights reserved.
+ *
+ * This software was developed at the Centre for Advanced Internet
+ * Architectures, Swinburne University, by Lawrence Stewart,
+ * made possible in part by a grant from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef	_SYS_KHELP_H_
+#define	_SYS_KHELP_H_
+
+/* XXX: struct helper embeds a uma_zone_t, so the UMA header is needed here. */
+#include <vm/uma.h>
+
+struct helper_dblock {
+	int32_t		hd_id;		/* presumably the owning helper's id; confirm */
+	void		*hd_block;	/* opaque data block pointer */
+};
+
+struct helper_dblocks {
+	struct helper_dblock	*blocks;	/* presumably nblocks entries; confirm */
+	int32_t			nblocks;
+	uint32_t		class;		/* HELPER_CLASS_* value */
+};
+
+struct helper {
+	int (*mod_init) (void);
+	int (*mod_destroy) (void);
+	uma_zone_t	h_zone;
+#define HELPER_NAME_MAXLEN 16
+	char		h_name[HELPER_NAME_MAXLEN];
+	uint16_t	h_flags;
+	uint32_t	h_class;
+	int32_t		h_id;
+	volatile uint32_t	h_refcount;
+	STAILQ_ENTRY(helper) h_next;
+};
+
+/* Helper flags */
+#define HELPER_NEEDS_DBLOCK	0x0001
+
+/* Helper classes */
+#define HELPER_CLASS_TCP	0x00000001
+
+int	init_helper_dblocks(struct helper_dblocks *hdbs);
+int	destroy_helper_dblocks(struct helper_dblocks *hdbs);
+int	register_helper(struct helper *h);
+int	deregister_helper(struct helper *h);
+int32_t	get_helper_id(char *hname);
+void *	get_helper_dblock(struct helper_dblocks *hdbs, int32_t id);
+
+#define	HELPER_LIST_WLOCK() rw_wlock(&helper_list_lock)
+#define	HELPER_LIST_WUNLOCK() rw_wunlock(&helper_list_lock)
+#define	HELPER_LIST_RLOCK() rw_rlock(&helper_list_lock)
+#define	HELPER_LIST_RUNLOCK() rw_runlock(&helper_list_lock)
+#define	HELPER_LIST_LOCK_ASSERT() rw_assert(&helper_list_lock, RA_LOCKED)
+
+#endif /* _SYS_KHELP_H_ */
diff -r 7159011c25ae -r 2d2f6f743238 sys/sys/module_khelp.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sys/sys/module_khelp.h	Fri Jul 23 15:02:54 2010 +1000
@@ -0,0 +1,81 @@
+/*-
+ * Copyright (c) 2010 Lawrence Stewart <lstewart@freebsd.org>
+ * All rights reserved.
+ *
+ * This software was developed at the Centre for Advanced Internet
+ * Architectures, Swinburne University, by Lawrence Stewart,
+ * made possible in part by a grant from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SYS_MODULE_KHELP_H_
+#define _SYS_MODULE_KHELP_H_
+
+struct helper_modevent_data {
+	char name[HELPER_NAME_MAXLEN];
+	struct helper *helper;
+	int uma_zsize;
+	uma_ctor umactor;
+	uma_dtor umadtor;
+};
+
+#define	DECLARE_HELPER(hname, hdata, version)				\
+	static struct helper_modevent_data hmd_##hname = {		\
+		.name = #hname,						\
+		.helper = hdata						\
+	};								\
+	static moduledata_t h_##hname = {				\
+		.name = #hname,						\
+		.evhand = helper_modevent,				\
+		.priv = &hmd_##hname					\
+	};								\
+	DECLARE_MODULE(hname, h_##hname, SI_SUB_PROTO_IFATTACHDOMAIN,	\
+	    SI_ORDER_ANY);						\
+	MODULE_VERSION(hname, version)
+
+#define	DECLARE_HELPER_UMA(hname, hdata, version, size, ctor, dtor)	\
+	static struct helper_modevent_data hmd_##hname = {		\
+		.name = #hname,						\
+		.helper = hdata,					\
+		.uma_zsize = size,					\
+		.umactor = ctor,					\
+		.umadtor = dtor						\
+	};								\
+	static moduledata_t h_##hname = {				\
+		.name = #hname,						\
+		.evhand = helper_modevent,				\
+		.priv = &hmd_##hname					\
+	};								\
+	DECLARE_MODULE(hname, h_##hname, SI_SUB_PROTO_IFATTACHDOMAIN,	\
+	    SI_ORDER_ANY);						\
+	MODULE_VERSION(hname, version)
+
+int	helper_modevent(module_t mod, int type, void *data);
+
+MALLOC_DECLARE(M_HELPER);
+MALLOC_DEFINE(M_HELPER, "helper data", "Blah");
+/* NOTE(review): MALLOC_DEFINE in a header defines M_HELPER per includer. */
+
+#endif /* _SYS_MODULE_KHELP_H_ */