--- Copyright (c) 2007, Centre for Advanced Internet Architectures --- Swinburne University of Technology, Melbourne, Australia --- (CRICOS number 00111D). --- --- CAIA Modular Congestion Control Patch v0.9.1 --- --- This patch was created against the FreeBSD 7.0-BETA4 source tree --- cvsup'd on 6th December 2007. --- --- This software was developed by James Healy --- and Lawrence Stewart --- --- All rights reserved. --- --- Redistribution and use in source and binary forms, with or without --- modification, are permitted provided that the following conditions --- are met: --- 1. Redistributions of source code must retain the above copyright --- notice, this list of conditions and the following disclaimer. --- 2. Redistributions in binary form must reproduce the above copyright --- notice, this list of conditions and the following disclaimer in the --- documentation and/or other materials provided with the distribution. --- 3. The names of the authors, the "Centre for Advanced Internet Architectures" --- and "Swinburne University of Technology" may not be used to endorse --- or promote products derived from this software without specific --- prior written permission. --- --- THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS \`\`AS IS'' AND --- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE --- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE --- ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE --- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL --- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS --- OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) --- HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT --- LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY --- OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF --- SUCH DAMAGE. 
--- --- sys/netinet/tcp_cc_functions.h.orig 1970-01-01 10:00:00.000000000 +1000 +++ sys/netinet/tcp_cc_functions.h 2007-11-22 11:59:45.000000000 +1100 @@ -0,0 +1,38 @@ +#ifndef _NETINET_TCP_CC_FUNCTIONS_H_ +#define _NETINET_TCP_CC_FUNCTIONS_H_ + +#include +#include + +/* + * Global CC vars (defined in tcp_cc_functions.c, except tcprexmtthresh which is defined in tcp_input.c) + */ +extern STAILQ_HEAD(tcp_cc_head, tcp_cc_functions) tcp_cc_list; +extern char tcp_cc_algorithm[]; +extern const int tcprexmtthresh; +extern struct tcp_cc_functions newreno_cc_functions; + +/* + * Define the new net.inet.tcp.cc sysctl tree + */ +SYSCTL_DECL(_net_inet_tcp_cc); + +/* + * CC housekeeping functions: list setup plus registration and deregistration of algorithms + */ +void tcp_cc_init(void); +void tcp_cc_register_algorithm(struct tcp_cc_functions *add_cc); +void tcp_cc_deregister_algorithm(struct tcp_cc_functions *remove_cc); + +/* + * NewReno CC functions (the hooks collected in newreno_cc_functions) + */ +int newreno_init(struct tcpcb *tp); +void newreno_cwnd_init(struct tcpcb *tp); +void newreno_ack_received(struct tcpcb *tp); +void newreno_post_fr(struct tcpcb *tp, struct tcphdr *th); +void newreno_after_idle(struct tcpcb *tp); +void newreno_after_timeout(struct tcpcb *tp); +void newreno_ssthresh_update(struct tcpcb *tp); + +#endif /* _NETINET_TCP_CC_FUNCTIONS_H_ */ --- sys/netinet/tcp_cc_functions.c.orig 1970-01-01 10:00:00.000000000 +1000 +++ sys/netinet/tcp_cc_functions.c 2007-11-27 11:33:56.000000000 +1100 @@ -0,0 +1,347 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + + +// list of available cc algorithms on the current system; the first entry is treated as the system default +struct tcp_cc_head tcp_cc_list = STAILQ_HEAD_INITIALIZER(tcp_cc_list); + +MALLOC_DECLARE(M_STRING); +MALLOC_DEFINE(M_STRING, "string", "a string"); + +// hook table for the built-in newreno algorithm (no deinit hook is provided) +struct tcp_cc_functions newreno_cc_functions = { + .name = "newreno", + .init = newreno_init, + .deinit = NULL, + .tcp_cwnd_init = newreno_cwnd_init, + .tcp_ack_received = newreno_ack_received, + .tcp_pre_fr = newreno_ssthresh_update, + .tcp_post_fr = 
newreno_post_fr, + .tcp_after_idle = newreno_after_idle, + .tcp_after_timeout = newreno_after_timeout +}; + +// name of the system wide default cc algorithm (one byte larger than tcp_cc_functions.name) +char tcp_cc_algorithm[TCP_CC_MAX_ALGORITHM_NAME_LEN+1]; + +// sysctl handler that allows the default cc algorithm for the system to be +// viewed and changed; a write that names an unregistered algorithm fails with EINVAL +static int +tcp_cc_default_algorithm(SYSCTL_HANDLER_ARGS) +{ + struct tcp_cc_functions *funcs; + + if (!req->newptr) + goto skip; + // NOTE(review): req->newptr is read directly here, before sysctl_handle_string() copies it in -- confirm this is safe for requests coming from userland + STAILQ_FOREACH(funcs, &tcp_cc_list, entries) + { + if (strncmp((char *)req->newptr, funcs->name, TCP_CC_MAX_ALGORITHM_NAME_LEN) == 0) + goto reorder; + } + + return EINVAL; + +reorder: + // Make the selected system default cc algorithm the first element in the list if it isn't already + if(funcs != STAILQ_FIRST(&tcp_cc_list)) + { + STAILQ_REMOVE(&tcp_cc_list, funcs, tcp_cc_functions, entries); + STAILQ_INSERT_HEAD(&tcp_cc_list, funcs, entries); + } + +skip: + return sysctl_handle_string(oidp, arg1, arg2, req); +} + +// sysctl handler that displays the available cc algorithms as a read +// only value: the names separated by ", " and followed by a single NUL byte +static int +tcp_cc_list_available(SYSCTL_HANDLER_ARGS) +{ + struct tcp_cc_functions *funcs; + int error = 0, pos = 0; + char buf[TCP_CC_MAX_ALGORITHM_NAME_LEN + 3]; // room for ", ", a full length name and the NUL + + STAILQ_FOREACH(funcs, &tcp_cc_list, entries) + { + if (pos == 0) + snprintf(buf, sizeof(buf), "%s", funcs->name); + else + snprintf(buf, sizeof(buf), ", %s", funcs->name); + + error = sysctl_handle_opaque(oidp, buf, strlen(buf), req); + if (error) + return error; + + pos++; + } + + buf[0] = '\0'; + error = sysctl_handle_opaque(oidp, buf, 1, req); + return error; +} + +// initialise cc on system boot +void +tcp_cc_init(void) +{ + // initialise list of cc algorithms + STAILQ_INIT(&tcp_cc_list); + + // add newreno to the list of available algorithms + tcp_cc_register_algorithm(&newreno_cc_functions); + + // set newreno to the system default + strncpy(tcp_cc_algorithm, newreno_cc_functions.name, sizeof(tcp_cc_algorithm)); +} + +void +tcp_cc_deregister_algorithm(struct tcp_cc_functions *remove_cc) +{ + struct tcp_cc_functions *funcs, 
*tmpfuncs; + register struct tcpcb *tp = NULL; + register struct inpcb *inp = NULL; + + // remove the algorithm from the list available to the system + STAILQ_FOREACH_SAFE(funcs, &tcp_cc_list, entries, tmpfuncs) + { + if (funcs == remove_cc) + { + // if this algorithm is the system default, reset the default to newreno + if (strncmp(tcp_cc_algorithm, remove_cc->name, TCP_CC_MAX_ALGORITHM_NAME_LEN) == 0) + snprintf(tcp_cc_algorithm,TCP_CC_MAX_ALGORITHM_NAME_LEN, "%s", newreno_cc_functions.name); + // NOTE(review): only the name is reset above; the list is not reordered although the head of tcp_cc_list is what new connections use as the default -- confirm newreno ends up first + STAILQ_REMOVE(&tcp_cc_list, funcs, tcp_cc_functions, entries); + + break; + } + } + + // check all active control blocks and change any that are using this + // algorithm back to newreno. If the algorithm that was in use requires + // deinit code to be run, call it + // TODO: do we need to hold a lock while accessing the tcp control block list + LIST_FOREACH(inp, &tcb, inp_list) + { + tp = intotcpcb(inp); + + // TODO: this if *shouldn't* be necessary. I'm checking tp just to be safe, but i've never + // seen it null here. tp->cc_functions is regularly null. When unloading a cc + // module, there are usually a few control blocks in this list that need to + // be checked, and the first one often has a null cc_functions, which is obviously + // less than ideal. The control block with the null cc_functions is also in a + // memory address that isn't initialised by the tcp_newtcpcb function in + // tcp_subr.c. We can't work out where it's created at this stage. 
+ if (tp && tp->cc_functions) + { + if (strncmp(tp->cc_functions->name, remove_cc->name,TCP_CC_MAX_ALGORITHM_NAME_LEN) == 0 ) + { + tmpfuncs = tp->cc_functions; + tp->cc_functions = &newreno_cc_functions; + if (tmpfuncs->deinit) + tmpfuncs->deinit(tp); + } + } + } +} +/* append add_cc to the tail of tcp_cc_list; no duplicate check is made */ +void +tcp_cc_register_algorithm(struct tcp_cc_functions *add_cc) +{ + STAILQ_INSERT_TAIL(&tcp_cc_list, add_cc, entries); +} + +/* + * NEW RENO: the built-in algorithm whose hooks are collected in newreno_cc_functions + */ +/* init hook: prints one message per call and always returns 0 */ +int +newreno_init(struct tcpcb *tp) +{ + printf("initialising tcp connection with newreno congestion control\n"); + return 0; +} + +// set ssthresh to half of min(snd_wnd, snd_cwnd), rounded down to whole segments, with a floor of two segments +void +newreno_ssthresh_update(struct tcpcb *tp) +{ + u_int win; + + // half of the smaller window, expressed in segments + win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; + + if (win < 2) + win = 2; + + tp->snd_ssthresh = win * tp->t_maxseg; +} + +// initial cwnd at the start of a connection +// if there is a hostcache entry with a cached cwnd for the foreign host, base cwnd on that +// else if rfc3390 is enabled, set cwnd to approx 4 MSS as recommended +// otherwise use the slow start flight size sysctl variables (ss_fltsz_local for local addresses, ss_fltsz for the rest) +void +newreno_cwnd_init(struct tcpcb *tp) +{ + struct hc_metrics_lite metrics; + struct inpcb *inp = tp->t_inpcb; + struct socket *so = inp->inp_socket; + + /* + * Set the slow-start flight size depending on whether this + * is a local network or not. + * + * Extend this so we cache the cwnd too and retrieve it here. + * Make cwnd even bigger than RFC3390 suggests but only if we + * have previous experience with the remote host. Be careful + * not to make cwnd bigger than remote receive window or our own + * send socket buffer. Maybe put some additional upper bound + * on the retrieved cwnd. Should do incremental updates to + * hostcache when cwnd collapses so next connection doesn't + * overload the path again. + * + * RFC3390 says only do this if SYN or SYN/ACK didn't get lost. + * We currently check only in syncache_socket for that. 
+ */ + + tcp_hc_get(&inp->inp_inc, &metrics); + /* the address family used for the local-address test below is read from inp->inp_vflag (INP_IPV6) */ +#define TCP_METRICS_CWND +#ifdef TCP_METRICS_CWND + if (metrics.rmx_cwnd) + tp->snd_cwnd = max(tp->t_maxseg, + min(metrics.rmx_cwnd / 2, + min(tp->snd_wnd, so->so_snd.sb_hiwat))); + else +#endif + if (tcp_do_rfc3390) + tp->snd_cwnd = min(4 * tp->t_maxseg, max(2 * tp->t_maxseg, 4380)); +#ifdef INET6 + else if (((inp->inp_vflag & INP_IPV6) && in6_localaddr(&inp->in6p_faddr)) || + (!(inp->inp_vflag & INP_IPV6) && in_localaddr(inp->inp_faddr))) +#else + else if (in_localaddr(inp->inp_faddr)) +#endif + tp->snd_cwnd = tp->t_maxseg * ss_fltsz_local; + else + tp->snd_cwnd = tp->t_maxseg * ss_fltsz; +} + +// increase cwnd on receipt of a successful ACK +// if cwnd <= ssthresh, increases by 1 MSS per ACK +// if cwnd > ssthresh, increase by ~1 MSS per RTT; the result is capped at TCP_MAXWIN << snd_scale +void +newreno_ack_received(struct tcpcb *tp) +{ + u_int cw = tp->snd_cwnd; + u_int incr = tp->t_maxseg; + + if (cw > tp->snd_ssthresh) + incr = incr * incr / cw; + + tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<<tp->snd_scale); +} + +// set the cwnd when leaving fast recovery. +// th can be null, in which case cwnd is simply set to ssthresh; otherwise cwnd becomes the +// outstanding data plus one segment whenever that outstanding data is less than ssthresh. +void +newreno_post_fr(struct tcpcb *tp, struct tcphdr *th) +{ + /* + * Out of fast recovery. + * Window inflation should have left us + * with approximately snd_ssthresh + * outstanding data. + * But in case we would be inclined to + * send a burst, better to do it via + * the slow start mechanism. + */ + if (th && SEQ_GT(th->th_ack + tp->snd_ssthresh, tp->snd_max)) + tp->snd_cwnd = tp->snd_max - th->th_ack + tp->t_maxseg; + else + tp->snd_cwnd = tp->snd_ssthresh; +} + +// if a connection has been idle for a while and more data is ready to be sent, +// reset cwnd +void +newreno_after_idle(struct tcpcb *tp) +{ + /* + * We have been idle for "a while" and no acks are + * expected to clock out any data we send -- + * slow start to get ack "clock" running again. 
+ * + * Set the slow-start flight size depending on whether + * this is a local network or not. + * + * The new cwnd is t_maxseg times ss_fltsz_local when the + * foreign address is local, and t_maxseg times ss_fltsz otherwise. + */ + int ss = ss_fltsz; + +#ifdef INET6 + if (tp->t_inpcb->inp_vflag & INP_IPV6) { + if (in6_localaddr(&tp->t_inpcb->in6p_faddr)) + ss = ss_fltsz_local; + } else +#endif /* INET6 */ + + if (in_localaddr(tp->t_inpcb->inp_faddr)) + ss = ss_fltsz_local; + + tp->snd_cwnd = tp->t_maxseg * ss; +} + +// after a retransmit timeout: recompute ssthresh, then close cwnd down to one segment. +void +newreno_after_timeout(struct tcpcb *tp) +{ + newreno_ssthresh_update(tp); + + /* + * Close the congestion window down to one segment + * (we'll open it by one segment for each ack we get). + * Since we probably have a window's worth of unacked + * data accumulated, this "slow start" keeps us from + * dumping all that data as back-to-back packets (which + * might overwhelm an intermediate gateway). + * + * There are two phases to the opening: Initially we + * open by one mss on each ack. This makes the window + * size increase exponentially with time. If the + * window is larger than the path can handle, this + * exponential growth results in dropped packet(s) + * almost immediately. To get more time between + * drops but still "push" the network to take advantage + * of improving conditions, we switch from exponential + * to linear window opening at some threshold size. + * For a threshold, we use half the current window + * size, truncated to a multiple of the mss. + * + * (the minimum cwnd that will give us exponential + * growth is 2 mss. We don't allow the threshold + * to go below this.) 
+ */ + tp->snd_cwnd = tp->t_maxseg; +} +/* net.inet.tcp.cc: parent node for the congestion control sysctls */ +SYSCTL_NODE(_net_inet_tcp, OID_AUTO, cc, CTLFLAG_RW, NULL, "TCP congestion control related settings"); +/* net.inet.tcp.cc.algorithm: read/write name of the default algorithm, handled by tcp_cc_default_algorithm */ +SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, algorithm, CTLTYPE_STRING|CTLFLAG_RW, &tcp_cc_algorithm, sizeof(tcp_cc_algorithm), tcp_cc_default_algorithm, "A", "default tcp congestion algorithm"); +/* net.inet.tcp.cc.available: read-only list of registered algorithms, produced by tcp_cc_list_available */ +SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, available, CTLTYPE_STRING|CTLFLAG_RD, NULL, 0, tcp_cc_list_available, "A", "list available tcp congestion algorithms"); --- sys/conf/files.orig 2007-12-07 16:51:38.000000000 +1100 +++ sys/conf/files 2007-12-07 16:18:28.000000000 +1100 @@ -1880,6 +1880,7 @@ netinet/sctp_usrreq.c optional inet inet6 sctp netinet/sctputil.c optional inet inet6 sctp netinet/tcp_debug.c optional tcpdebug +netinet/tcp_cc_functions.c optional inet netinet/tcp_hostcache.c optional inet netinet/tcp_input.c optional inet netinet/tcp_output.c optional inet --- sys/netinet/tcp_input.c.orig 2007-12-07 16:43:36.000000000 +1100 +++ sys/netinet/tcp_input.c 2007-11-22 11:59:45.000000000 +1100 @@ -96,7 +96,7 @@ #include -static const int tcprexmtthresh = 3; +const int tcprexmtthresh = 3; struct tcpstat tcpstat; SYSCTL_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RW, @@ -123,7 +123,7 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW, &tcp_do_rfc3042, 0, "Enable RFC 3042 (Limited Transmit)"); -static int tcp_do_rfc3390 = 1; +int tcp_do_rfc3390 = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW, &tcp_do_rfc3390, 0, "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)"); @@ -1000,14 +1000,9 @@ if (SEQ_GT(th->th_ack, tp->snd_una) && SEQ_LEQ(th->th_ack, tp->snd_max) && tp->snd_cwnd >= tp->snd_wnd && - ((!tcp_do_newreno && - !(tp->t_flags & TF_SACK_PERMIT) && - tp->t_dupacks < tcprexmtthresh) || - ((tcp_do_newreno || - (tp->t_flags & TF_SACK_PERMIT)) && - !IN_FASTRECOVERY(tp) && - (to.to_flags & 
TOF_SACK) == 0 && + TAILQ_EMPTY(&tp->snd_holes)) { KASSERT(headlocked, ("%s: headlocked", __func__)); INP_INFO_WUNLOCK(&tcbinfo); @@ -1759,13 +1754,14 @@ * to keep a constant cwnd packets in the * network. */ + if (!tcp_timer_active(tp, TT_REXMT) || th->th_ack != tp->snd_una) tp->t_dupacks = 0; + else if (++tp->t_dupacks > tcprexmtthresh || - ((tcp_do_newreno || - (tp->t_flags & TF_SACK_PERMIT)) && - IN_FASTRECOVERY(tp))) { + IN_FASTRECOVERY(tp)) { + if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp)) { int awnd; @@ -1789,7 +1785,6 @@ goto drop; } else if (tp->t_dupacks == tcprexmtthresh) { tcp_seq onxt = tp->snd_nxt; - u_int win; /* * If we're doing sack, check to @@ -1803,22 +1798,26 @@ tp->t_dupacks = 0; break; } - } else if (tcp_do_newreno) { + } else { if (SEQ_LEQ(th->th_ack, tp->snd_recover)) { tp->t_dupacks = 0; break; } } - win = min(tp->snd_wnd, tp->snd_cwnd) / - 2 / tp->t_maxseg; - if (win < 2) - win = 2; - tp->snd_ssthresh = win * tp->t_maxseg; + + // If the current tcp cc module has defined a hook + // for tasks to run before entering FR, call it + if (tp->cc_functions->tcp_pre_fr) + tp->cc_functions->tcp_pre_fr(tp); + ENTER_FASTRECOVERY(tp); tp->snd_recover = tp->snd_max; tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; + + // if SACK is enabled, set some variables in the control block, + // send the lost packet and then finish processing this packet if (tp->t_flags & TF_SACK_PERMIT) { tcpstat.tcps_sack_recovery_episode++; tp->sack_newdata = tp->snd_nxt; @@ -1826,18 +1825,23 @@ (void) tcp_output(tp); goto drop; } + tp->snd_nxt = th->th_ack; tp->snd_cwnd = tp->t_maxseg; (void) tcp_output(tp); KASSERT(tp->snd_limited <= 2, - ("%s: tp->snd_limited too big", - __func__)); + ("%s: tp->snd_limited too big", + __func__)); + // set cwnd to an appropriate value as we enter fast recovery tp->snd_cwnd = tp->snd_ssthresh + - tp->t_maxseg * - (tp->t_dupacks - tp->snd_limited); + tp->t_maxseg * + (tp->t_dupacks - tp->snd_limited); + if (SEQ_GT(onxt, 
tp->snd_nxt)) tp->snd_nxt = onxt; + goto drop; + } else if (tcp_do_rfc3042) { u_long oldcwnd = tp->snd_cwnd; tcp_seq oldsndmax = tp->snd_max; @@ -1880,38 +1884,17 @@ * If the congestion window was inflated to account * for the other side's cached packets, retract it. */ - if (tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) { - if (IN_FASTRECOVERY(tp)) { - if (SEQ_LT(th->th_ack, tp->snd_recover)) { - if (tp->t_flags & TF_SACK_PERMIT) - tcp_sack_partialack(tp, th); - else - tcp_newreno_partial_ack(tp, th); - } else { - /* - * Out of fast recovery. - * Window inflation should have left us - * with approximately snd_ssthresh - * outstanding data. - * But in case we would be inclined to - * send a burst, better to do it via - * the slow start mechanism. - */ - if (SEQ_GT(th->th_ack + - tp->snd_ssthresh, - tp->snd_max)) - tp->snd_cwnd = tp->snd_max - - th->th_ack + - tp->t_maxseg; - else - tp->snd_cwnd = tp->snd_ssthresh; - } - } - } else { - if (tp->t_dupacks >= tcprexmtthresh && - tp->snd_cwnd > tp->snd_ssthresh) - tp->snd_cwnd = tp->snd_ssthresh; - } + if (IN_FASTRECOVERY(tp)) { + if (SEQ_LT(th->th_ack, tp->snd_recover)) { + if (tp->t_flags & TF_SACK_PERMIT) + tcp_sack_partialack(tp, th); + else + tcp_newreno_partial_ack(tp, th); + } else { + if (tp->cc_functions->tcp_post_fr) + tp->cc_functions->tcp_post_fr(tp, th); + } + } tp->t_dupacks = 0; /* * If we reach this point, ACK is not a duplicate, @@ -2014,13 +1997,9 @@ * Otherwise open linearly: maxseg per window * (maxseg^2 / cwnd per packet). 
*/ - if ((!tcp_do_newreno && !(tp->t_flags & TF_SACK_PERMIT)) || - !IN_FASTRECOVERY(tp)) { - u_int cw = tp->snd_cwnd; - u_int incr = tp->t_maxseg; - if (cw > tp->snd_ssthresh) - incr = incr * incr / cw; - tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<snd_scale); + if (!IN_FASTRECOVERY(tp)) { + if (tp->cc_functions->tcp_ack_received) + tp->cc_functions->tcp_ack_received(tp); } SOCKBUF_LOCK(&so->so_snd); if (acked > so->so_snd.sb_cc) { @@ -2035,14 +2014,11 @@ /* NB: sowwakeup_locked() does an implicit unlock. */ sowwakeup_locked(so); /* Detect una wraparound. */ - if ((tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && - !IN_FASTRECOVERY(tp) && + if (!IN_FASTRECOVERY(tp) && SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; - if ((tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && - IN_FASTRECOVERY(tp) && - SEQ_GEQ(th->th_ack, tp->snd_recover)) + if (IN_FASTRECOVERY(tp) && SEQ_GEQ(th->th_ack, tp->snd_recover)) EXIT_FASTRECOVERY(tp); tp->snd_una = th->th_ack; if (tp->t_flags & TF_SACK_PERMIT) { @@ -2909,41 +2885,11 @@ if (metrics.rmx_bandwidth) tp->snd_bandwidth = metrics.rmx_bandwidth; - /* - * Set the slow-start flight size depending on whether this - * is a local network or not. - * - * Extend this so we cache the cwnd too and retrieve it here. - * Make cwnd even bigger than RFC3390 suggests but only if we - * have previous experience with the remote host. Be careful - * not make cwnd bigger than remote receive window or our own - * send socket buffer. Maybe put some additional upper bound - * on the retrieved cwnd. Should do incremental updates to - * hostcache when cwnd collapses so next connection doesn't - * overloads the path again. - * - * RFC3390 says only do this if SYN or SYN/ACK didn't got lost. - * We currently check only in syncache_socket for that. 
- */ -#define TCP_METRICS_CWND -#ifdef TCP_METRICS_CWND - if (metrics.rmx_cwnd) - tp->snd_cwnd = max(mss, - min(metrics.rmx_cwnd / 2, - min(tp->snd_wnd, so->so_snd.sb_hiwat))); - else -#endif - if (tcp_do_rfc3390) - tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380)); -#ifdef INET6 - else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) || - (!isipv6 && in_localaddr(inp->inp_faddr))) -#else - else if (in_localaddr(inp->inp_faddr)) -#endif - tp->snd_cwnd = mss * ss_fltsz_local; + // set the initial cwnd value + if (tp->cc_functions->tcp_cwnd_init) + tp->cc_functions->tcp_cwnd_init(tp); else - tp->snd_cwnd = mss * ss_fltsz; + tp->snd_cwnd = mss; /* Check the interface for TSO capabilities. */ if (mtuflags & CSUM_TSO) --- sys/netinet/tcp_output.c.orig 2007-12-07 16:43:23.000000000 +1100 +++ sys/netinet/tcp_output.c 2007-12-07 13:11:01.000000000 +1100 @@ -98,10 +98,6 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, local_slowstart_flightsize, CTLFLAG_RW, &ss_fltsz_local, 1, "Slow start flight size for local networks"); -int tcp_do_newreno = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW, - &tcp_do_newreno, 0, "Enable NewReno Algorithms"); - int tcp_do_tso = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW, &tcp_do_tso, 0, "Enable TCP Segmentation Offload"); @@ -162,24 +158,9 @@ */ idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); if (idle && (ticks - tp->t_rcvtime) >= tp->t_rxtcur) { - /* - * We have been idle for "a while" and no acks are - * expected to clock out any data we send -- - * slow start to get ack "clock" running again. - * - * Set the slow-start flight size depending on whether - * this is a local network or not. 
- */ - int ss = ss_fltsz; -#ifdef INET6 - if (isipv6) { - if (in6_localaddr(&tp->t_inpcb->in6p_faddr)) - ss = ss_fltsz_local; - } else -#endif /* INET6 */ - if (in_localaddr(tp->t_inpcb->inp_faddr)) - ss = ss_fltsz_local; - tp->snd_cwnd = tp->t_maxseg * ss; + // reset cwnd after a period of idleness + if (tp->cc_functions->tcp_after_idle) + tp->cc_functions->tcp_after_idle(tp); } tp->t_flags &= ~TF_LASTIDLE; if (idle) { --- sys/netinet/tcp_subr.c.orig 2007-12-07 16:43:06.000000000 +1100 +++ sys/netinet/tcp_subr.c 2007-12-07 13:11:01.000000000 +1100 @@ -84,6 +84,7 @@ #include #include #include +#include #include #ifdef INET6 #include @@ -267,6 +268,8 @@ tcp_inflight_rttthresh = TCPTV_INFLIGHT_RTTTHRESH; tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT; + tcp_cc_init(); + INP_INFO_LOCK_INIT(&tcbinfo, "tcp"); LIST_INIT(&tcb); tcbinfo.ipi_listhead = &tcb; @@ -590,7 +593,20 @@ tm = uma_zalloc(tcpcb_zone, M_NOWAIT | M_ZERO); if (tm == NULL) return (NULL); + tp = &tm->tcb; + + // use the current system default cc algorithm, which is always the first + // algorithm in tcp_cc_list + tp->cc_functions = STAILQ_FIRST(&tcp_cc_list); + + // if the cc module fails to initialize, stop building the control block + if (tp->cc_functions->init(tp) > 0) + { + uma_zfree(tcpcb_zone, tp); + return NULL; + } + tp->t_timers = &tm->tt; /* LIST_INIT(&tp->t_segq); */ /* XXX covered by M_ZERO */ tp->t_maxseg = tp->t_maxopd = @@ -749,6 +765,13 @@ tp->t_segqlen--; tcp_reass_qsize--; } + + /* allow the congestion control algorithm in use for this control + * block to clean up after itself + */ + if (tp->cc_functions->deinit) + tp->cc_functions->deinit(tp); + tcp_free_sackholes(tp); inp->inp_ppcb = NULL; tp->t_inpcb = NULL; --- sys/netinet/tcp_syncache.c.orig 2007-12-07 16:42:13.000000000 +1100 +++ sys/netinet/tcp_syncache.c 2007-12-07 13:11:01.000000000 +1100 @@ -1184,7 +1184,7 @@ if (to->to_flags & TOF_SIGNATURE) sc->sc_flags |= SCF_SIGNATURE; #endif - if (to->to_flags & TOF_SACK) + if (to->to_flags 
& TOF_SACKPERM) sc->sc_flags |= SCF_SACK; if (to->to_flags & TOF_MSS) sc->sc_peer_mss = to->to_mss; /* peer mss may be zero */ --- sys/netinet/tcp_timer.c.orig 2007-12-07 16:42:30.000000000 +1100 +++ sys/netinet/tcp_timer.c 2007-11-21 15:37:45.000000000 +1100 @@ -518,38 +518,12 @@ * If timing a segment in this window, stop the timer. */ tp->t_rtttime = 0; - /* - * Close the congestion window down to one segment - * (we'll open it by one segment for each ack we get). - * Since we probably have a window's worth of unacked - * data accumulated, this "slow start" keeps us from - * dumping all that data as back-to-back packets (which - * might overwhelm an intermediate gateway). - * - * There are two phases to the opening: Initially we - * open by one mss on each ack. This makes the window - * size increase exponentially with time. If the - * window is larger than the path can handle, this - * exponential growth results in dropped packet(s) - * almost immediately. To get more time between - * drops but still "push" the network to take advantage - * of improving conditions, we switch from exponential - * to linear window opening at some threshhold size. - * For a threshhold, we use half the current window - * size, truncated to a multiple of the mss. - * - * (the minimum cwnd that will give us exponential - * growth is 2 mss. We don't allow the threshhold - * to go below this.) 
- */ - { - u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; - if (win < 2) - win = 2; - tp->snd_cwnd = tp->t_maxseg; - tp->snd_ssthresh = win * tp->t_maxseg; - tp->t_dupacks = 0; - } + + if (tp->cc_functions->tcp_after_timeout) + tp->cc_functions->tcp_after_timeout(tp); + + tp->t_dupacks = 0; + EXIT_FASTRECOVERY(tp); (void) tcp_output(tp); --- sys/netinet/tcp_var.h.orig 2007-12-07 16:42:45.000000000 +1100 +++ sys/netinet/tcp_var.h 2007-11-22 11:59:45.000000000 +1100 @@ -206,6 +206,9 @@ int t_rttlow; /* smallest observerved RTT */ u_int32_t rfbuf_ts; /* recv buffer autoscaling timestamp */ int rfbuf_cnt; /* recv buffer autoscaling byte count */ + + struct tcp_cc_functions *cc_functions; /* the functions that will manage congestion control*/ + void *cc_data; /* pointer to a struct containing data required for the cc algorithm in use */ }; #define IN_FASTRECOVERY(tp) (tp->t_flags & TF_FASTRECOVERY) @@ -446,6 +449,43 @@ }; #endif +#define TCP_CC_MAX_ALGORITHM_NAME_LEN 15 + +/* + * Structure to hold function pointers to the functions responsible + * for congestion control. 
Based on similar structure in the SCTP stack + */ +struct tcp_cc_functions { + char name[TCP_CC_MAX_ALGORITHM_NAME_LEN]; + + // init the congestion algorithm for the specified control block; tcp_newtcpcb calls it without a NULL check and abandons the new control block when it returns > 0 + int (*init) (struct tcpcb *tp); + + // deinit the congestion algorithm for the specified control block; optional, callers skip it when NULL + void (*deinit) (struct tcpcb *tp); + + // initialise cwnd at the start of a connection + void (*tcp_cwnd_init) (struct tcpcb *tp); + + // called on the receipt of a valid ack + void (*tcp_ack_received) (struct tcpcb *tp); + + // hook to perform any necessary tasks before entering FR + void (*tcp_pre_fr) (struct tcpcb *tp); + + // called after exiting fast recovery + void (*tcp_post_fr) (struct tcpcb *tp, struct tcphdr *th); + + // perform tasks when data transfer resumes after an idle period + void (*tcp_after_idle) (struct tcpcb *tp); + + // perform tasks when the connection's retransmit timer expires + void (*tcp_after_timeout) (struct tcpcb *tp); + + // STAILQ linkage that chains this entry into tcp_cc_list + STAILQ_ENTRY(tcp_cc_functions) entries; +}; + /* * Names for TCP sysctl objects */ @@ -498,7 +538,7 @@ extern int tcp_mssdflt; /* XXX */ extern int tcp_minmss; extern int tcp_delack_enabled; -extern int tcp_do_newreno; +extern int tcp_do_rfc3390; extern int path_mtu_discovery; extern int ss_fltsz; extern int ss_fltsz_local;