--- Copyright (c) 2007-2008, Centre for Advanced Internet Architectures --- Swinburne University of Technology, Melbourne, Australia --- (CRICOS number 00111D). --- Copyright (c) 2008-2009, Lawrence Stewart --- --- All rights reserved. --- --- CAIA Modular Congestion Control Patch v0.9.2 --- --- This patch was created against revision 190777 of the FreeBSD 8-CURRENT --- Subversion source tree by running the following command and cleaning up the --- output to remove irrelevant parts: --- --- svn diff http://svn.freebsd.org/base/head/sys@190777 \ --- http://svn.freebsd.org/base/projects/tcp_cc_8.x/sys@190777 --- --- To obtain the correct revision of the FreeBSD source tree that this patch --- applies to, and store it in the local directory "/path/to/src", run: --- --- svn co -r 190777 http://svn.freebsd.org/base/head --- --- Make sure the base system you are installing onto is already running --- FreeBSD 8.x before continuing. --- --- Issuing the following commands will result in a running modular congestion --- control capable system: --- --- cd /path/to/src/sys --- patch -p0 < /path/to/caia_modularcc_v0.9.2_8.x.r190777.patch --- cd /path/to/src/ --- make buildworld buildkernel installkernel installworld --- mergemaster -iF -m /path/to/src --- reboot --- --- The modular congestion control patch was first released in 2007 by --- James Healy and Lawrence Stewart whilst working on the NewTCP research --- project at Swinburne University's Centre for Advanced Internet --- Architectures, Melbourne, Australia, which was made possible in part by a --- grant from the Cisco University Research Program Fund at --- Community Foundation Silicon Valley. More details are available at: --- http://caia.swin.edu.au/urp/newtcp/ --- --- Lawrence Stewart has continued development of this work since 2008 in his --- spare time, and is currently the sole maintainer. 
All contact regarding --- this patch should be directed to him via email: lastewart@swin.edu.au --- --- Redistribution and use in source and binary forms, with or without --- modification, are permitted provided that the following conditions --- are met: --- 1. Redistributions of source code must retain the above copyright --- notice, this list of conditions and the following disclaimer. --- 2. Redistributions in binary form must reproduce the above copyright --- notice, this list of conditions and the following disclaimer in the --- documentation and/or other materials provided with the distribution. --- 3. The names of the authors, "Swinburne University of Technology" and the --- "Centre for Advanced Internet Architectures" may not be used to endorse --- or promote products derived from this software without specific --- prior written permission. --- --- THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND --- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE --- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE --- ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE --- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL --- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS --- OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) --- HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT --- LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY --- OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF --- SUCH DAMAGE. 
--- Index: conf/files =================================================================== --- conf/files (.../head/sys) (revision 190777) +++ conf/files (.../projects/tcp_cc_8.x/sys) (revision 190777) @@ -2337,6 +2337,7 @@ netinet/ip_options.c optional inet netinet/ip_output.c optional inet netinet/raw_ip.c optional inet +netinet/cc.c optional inet netinet/sctp_asconf.c optional inet sctp netinet/sctp_auth.c optional inet sctp netinet/sctp_bsd_addr.c optional inet sctp Index: netinet/tcp_input.c =================================================================== --- netinet/tcp_input.c (.../head/sys) (revision 190777) +++ netinet/tcp_input.c (.../projects/tcp_cc_8.x/sys) (revision 190777) @@ -62,6 +62,7 @@ #define TCPSTATES /* for logging */ +#include #include #include #include @@ -76,7 +77,6 @@ #include #include #include -#include #include #include #include @@ -102,7 +102,7 @@ #include -static const int tcprexmtthresh = 3; +const int tcprexmtthresh = 3; #ifdef VIMAGE_GLOBALS struct tcpstat tcpstat; @@ -1264,14 +1264,9 @@ if (SEQ_GT(th->th_ack, tp->snd_una) && SEQ_LEQ(th->th_ack, tp->snd_max) && tp->snd_cwnd >= tp->snd_wnd && - ((!V_tcp_do_newreno && - !(tp->t_flags & TF_SACK_PERMIT) && - tp->t_dupacks < tcprexmtthresh) || - ((V_tcp_do_newreno || - (tp->t_flags & TF_SACK_PERMIT)) && - !IN_FASTRECOVERY(tp) && - (to.to_flags & TOF_SACK) == 0 && - TAILQ_EMPTY(&tp->snd_holes)))) { + !IN_FASTRECOVERY(tp) && + (to.to_flags & TOF_SACK) == 0 && + TAILQ_EMPTY(&tp->snd_holes)) { /* * This is a pure ack for outstanding data. 
*/ @@ -2061,9 +2056,7 @@ th->th_ack != tp->snd_una) tp->t_dupacks = 0; else if (++tp->t_dupacks > tcprexmtthresh || - ((V_tcp_do_newreno || - (tp->t_flags & TF_SACK_PERMIT)) && - IN_FASTRECOVERY(tp))) { + IN_FASTRECOVERY(tp)) { if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp)) { int awnd; @@ -2100,14 +2093,24 @@ tp->t_dupacks = 0; break; } - } else if (V_tcp_do_newreno || - V_tcp_do_ecn) { + } else { if (SEQ_LEQ(th->th_ack, tp->snd_recover)) { tp->t_dupacks = 0; break; } } + + /* + * If the current tcp cc module has + * defined a hook for tasks to run + * before entering FR, call it + */ + if (CC_ALGO(tp)->pre_fr != NULL) + CC_ALGO(tp)->pre_fr(tp, th); + + ENTER_FASTRECOVERY(tp); + tp->snd_recover = tp->snd_max; tcp_congestion_exp(tp); tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; @@ -2172,37 +2175,16 @@ * If the congestion window was inflated to account * for the other side's cached packets, retract it. */ - if (V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) { - if (IN_FASTRECOVERY(tp)) { - if (SEQ_LT(th->th_ack, tp->snd_recover)) { - if (tp->t_flags & TF_SACK_PERMIT) - tcp_sack_partialack(tp, th); - else - tcp_newreno_partial_ack(tp, th); - } else { - /* - * Out of fast recovery. - * Window inflation should have left us - * with approximately snd_ssthresh - * outstanding data. - * But in case we would be inclined to - * send a burst, better to do it via - * the slow start mechanism. 
- */ - if (SEQ_GT(th->th_ack + - tp->snd_ssthresh, - tp->snd_max)) - tp->snd_cwnd = tp->snd_max - - th->th_ack + - tp->t_maxseg; - else - tp->snd_cwnd = tp->snd_ssthresh; - } + if (IN_FASTRECOVERY(tp)) { + if (SEQ_LT(th->th_ack, tp->snd_recover)) { + if (tp->t_flags & TF_SACK_PERMIT) + tcp_sack_partialack(tp, th); + else + tcp_newreno_partial_ack(tp, th); + } else { + if (CC_ALGO(tp)->post_fr != NULL) + CC_ALGO(tp)->post_fr(tp, th); } - } else { - if (tp->t_dupacks >= tcprexmtthresh && - tp->snd_cwnd > tp->snd_ssthresh) - tp->snd_cwnd = tp->snd_ssthresh; } tp->t_dupacks = 0; /* @@ -2302,59 +2284,12 @@ /* * When new data is acked, open the congestion window. - * Method depends on which congestion control state we're - * in (slow start or cong avoid) and if ABC (RFC 3465) is - * enabled. - * - * slow start: cwnd <= ssthresh - * cong avoid: cwnd > ssthresh - * - * slow start and ABC (RFC 3465): - * Grow cwnd exponentially by the amount of data - * ACKed capping the max increment per ACK to - * (abc_l_var * maxseg) bytes. - * - * slow start without ABC (RFC 2581): - * Grow cwnd exponentially by maxseg per ACK. - * - * cong avoid and ABC (RFC 3465): - * Grow cwnd linearly by maxseg per RTT for each - * cwnd worth of ACKed data. - * - * cong avoid without ABC (RFC 2581): - * Grow cwnd linearly by approximately maxseg per RTT using - * maxseg^2 / cwnd per ACK as the increment. - * If cwnd > maxseg^2, fix the cwnd increment at 1 byte to - * avoid capping cwnd. + * The specifics of how this is achieved are up to the + * congestion control algorithm in use for this connection. */ - if ((!V_tcp_do_newreno && !(tp->t_flags & TF_SACK_PERMIT)) || - !IN_FASTRECOVERY(tp)) { - u_int cw = tp->snd_cwnd; - u_int incr = tp->t_maxseg; - /* In congestion avoidance? 
*/ - if (cw > tp->snd_ssthresh) { - if (V_tcp_do_rfc3465) { - tp->t_bytes_acked += acked; - if (tp->t_bytes_acked >= tp->snd_cwnd) - tp->t_bytes_acked -= cw; - else - incr = 0; - } - else - incr = max((incr * incr / cw), 1); - /* - * In slow-start with ABC enabled and no RTO in sight? - * (Must not use abc_l_var > 1 if slow starting after an - * RTO. On RTO, snd_nxt = snd_una, so the snd_nxt == - * snd_max check is sufficient to handle this). - */ - } else if (V_tcp_do_rfc3465 && - tp->snd_nxt == tp->snd_max) - incr = min(acked, - V_tcp_abc_l_var * tp->t_maxseg); - /* ABC is on by default, so (incr == 0) frequently. */ - if (incr > 0) - tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<<tp->snd_scale); + if (!IN_FASTRECOVERY(tp)) { + if (CC_ALGO(tp)->ack_received != NULL) + CC_ALGO(tp)->ack_received(tp, th); } SOCKBUF_LOCK(&so->so_snd); if (acked > so->so_snd.sb_cc) { @@ -2369,13 +2304,11 @@ /* NB: sowwakeup_locked() does an implicit unlock. */ sowwakeup_locked(so); /* Detect una wraparound. */ - if ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && - !IN_FASTRECOVERY(tp) && + if (!IN_FASTRECOVERY(tp) && SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; - if ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && - IN_FASTRECOVERY(tp) && + if (IN_FASTRECOVERY(tp) && SEQ_GEQ(th->th_ack, tp->snd_recover)) { EXIT_FASTRECOVERY(tp); tp->t_bytes_acked = 0; @@ -3336,41 +3269,9 @@ if (metrics.rmx_bandwidth) tp->snd_bandwidth = metrics.rmx_bandwidth; - /* - * Set the slow-start flight size depending on whether this - * is a local network or not. - * - * Extend this so we cache the cwnd too and retrieve it here. - * Make cwnd even bigger than RFC3390 suggests but only if we - * have previous experience with the remote host. Be careful - * not make cwnd bigger than remote receive window or our own - * send socket buffer. Maybe put some additional upper bound - * on the retrieved cwnd. 
Should do incremental updates to - * hostcache when cwnd collapses so next connection doesn't - * overloads the path again. - * - * RFC3390 says only do this if SYN or SYN/ACK didn't got lost. - * We currently check only in syncache_socket for that. - */ -#define TCP_METRICS_CWND -#ifdef TCP_METRICS_CWND - if (metrics.rmx_cwnd) - tp->snd_cwnd = max(mss, - min(metrics.rmx_cwnd / 2, - min(tp->snd_wnd, so->so_snd.sb_hiwat))); - else -#endif - if (V_tcp_do_rfc3390) - tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380)); -#ifdef INET6 - else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) || - (!isipv6 && in_localaddr(inp->inp_faddr))) -#else - else if (in_localaddr(inp->inp_faddr)) -#endif - tp->snd_cwnd = mss * V_ss_fltsz_local; - else - tp->snd_cwnd = mss * V_ss_fltsz; + /* set the initial cwnd value */ + if (CC_ALGO(tp)->cwnd_init != NULL) + CC_ALGO(tp)->cwnd_init(tp); /* Check the interface for TSO capabilities. */ if (mtuflags & CSUM_TSO) Index: netinet/tcp_subr.c =================================================================== --- netinet/tcp_subr.c (.../head/sys) (revision 190777) +++ netinet/tcp_subr.c (.../projects/tcp_cc_8.x/sys) (revision 190777) @@ -49,6 +49,8 @@ #ifdef INET6 #include #endif +#include +#include #include #include #include @@ -62,6 +64,7 @@ #include #include +#include #include #include #include @@ -80,7 +83,6 @@ #include #endif #include -#include #include #include #include @@ -361,6 +363,8 @@ V_tcp_inflight_rttthresh = TCPTV_INFLIGHT_RTTTHRESH; tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT; + cc_init(); + TUNABLE_INT_FETCH("net.inet.tcp.sack.enable", &V_tcp_do_sack); INP_INFO_LOCK_INIT(&V_tcbinfo, "tcp"); @@ -690,6 +694,21 @@ if (tm == NULL) return (NULL); tp = &tm->tcb; + + /* + * use the current system default cc algorithm, which is always + * the first algorithm in cc_list + */ + CC_LIST_RLOCK(); + CC_ALGO(tp) = STAILQ_FIRST(&cc_list); + CC_LIST_RUNLOCK(); + + /* if the cc module fails to init, stop building the control block */ + if 
(CC_ALGO(tp)->init(tp) > 0) { + uma_zfree(tcpcb_zone, tp); + return NULL; + } + tp->t_timers = &tm->tt; /* LIST_INIT(&tp->t_segq); */ /* XXX covered by M_ZERO */ tp->t_maxseg = tp->t_maxopd = @@ -852,8 +871,13 @@ } /* Disconnect offload device, if any. */ tcp_offload_detach(tp); - tcp_free_sackholes(tp); + + /* Allow the cc algorithm in use for this cb to clean up after itself */ + if (CC_ALGO(tp)->deinit != NULL) + CC_ALGO(tp)->deinit(tp); + + CC_ALGO(tp) = NULL; inp->inp_ppcb = NULL; tp->t_inpcb = NULL; uma_zfree(tcpcb_zone, tp); Index: netinet/tcp_timer.c =================================================================== --- netinet/tcp_timer.c (.../head/sys) (revision 190777) +++ netinet/tcp_timer.c (.../projects/tcp_cc_8.x/sys) (revision 190777) @@ -50,6 +50,7 @@ #include #include +#include #include #include #include @@ -57,7 +58,6 @@ #include #endif #include -#include #include #include #include @@ -554,38 +554,11 @@ * If timing a segment in this window, stop the timer. */ tp->t_rtttime = 0; - /* - * Close the congestion window down to one segment - * (we'll open it by one segment for each ack we get). - * Since we probably have a window's worth of unacked - * data accumulated, this "slow start" keeps us from - * dumping all that data as back-to-back packets (which - * might overwhelm an intermediate gateway). - * - * There are two phases to the opening: Initially we - * open by one mss on each ack. This makes the window - * size increase exponentially with time. If the - * window is larger than the path can handle, this - * exponential growth results in dropped packet(s) - * almost immediately. To get more time between - * drops but still "push" the network to take advantage - * of improving conditions, we switch from exponential - * to linear window opening at some threshhold size. - * For a threshhold, we use half the current window - * size, truncated to a multiple of the mss. - * - * (the minimum cwnd that will give us exponential - * growth is 2 mss. 
We don't allow the threshhold - * to go below this.) - */ - { - u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; - if (win < 2) - win = 2; - tp->snd_cwnd = tp->t_maxseg; - tp->snd_ssthresh = win * tp->t_maxseg; - tp->t_dupacks = 0; - } + + if (CC_ALGO(tp)->after_timeout != NULL) + CC_ALGO(tp)->after_timeout(tp); + + tp->t_dupacks = 0; EXIT_FASTRECOVERY(tp); tp->t_bytes_acked = 0; (void) tcp_output(tp); Index: netinet/tcp_var.h =================================================================== --- netinet/tcp_var.h (.../head/sys) (revision 190777) +++ netinet/tcp_var.h (.../projects/tcp_cc_8.x/sys) (revision 190777) @@ -190,6 +190,8 @@ struct toe_usrreqs *t_tu; /* offload operations vector */ void *t_toe; /* TOE pcb pointer */ int t_bytes_acked; /* # bytes acked during current RTT */ + struct cc_algo *cc_algo; /* the algorithm that will manage congestion control*/ + void *cc_data; /* pointer to a struct containing data required for the cc algorithm in use */ }; /* @@ -527,7 +529,7 @@ extern int tcp_mssdflt; /* XXX */ extern int tcp_minmss; extern int tcp_delack_enabled; -extern int tcp_do_newreno; +extern int tcp_do_rfc3390; extern int path_mtu_discovery; extern int ss_fltsz; extern int ss_fltsz_local; Index: netinet/tcp_output.c =================================================================== --- netinet/tcp_output.c (.../head/sys) (revision 190777) +++ netinet/tcp_output.c (.../projects/tcp_cc_8.x/sys) (revision 190777) @@ -54,6 +54,7 @@ #include #include +#include #include #include #include @@ -65,7 +66,6 @@ #include #include #endif -#include #define TCPOUTFLAGS #include #include @@ -111,9 +111,6 @@ local_slowstart_flightsize, CTLFLAG_RW, ss_fltsz_local, 1, "Slow start flight size for local networks"); -SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW, - tcp_do_newreno, 0, "Enable NewReno Algorithms"); - SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW, tcp_do_tso, 0, "Enable TCP Segmentation 
Offload"); @@ -174,24 +171,9 @@ */ idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); if (idle && (ticks - tp->t_rcvtime) >= tp->t_rxtcur) { - /* - * We have been idle for "a while" and no acks are - * expected to clock out any data we send -- - * slow start to get ack "clock" running again. - * - * Set the slow-start flight size depending on whether - * this is a local network or not. - */ - int ss = V_ss_fltsz; -#ifdef INET6 - if (isipv6) { - if (in6_localaddr(&tp->t_inpcb->in6p_faddr)) - ss = V_ss_fltsz_local; - } else -#endif /* INET6 */ - if (in_localaddr(tp->t_inpcb->inp_faddr)) - ss = V_ss_fltsz_local; - tp->snd_cwnd = tp->t_maxseg * ss; + /* reset cwnd after a period of idleness */ + if (CC_ALGO(tp)->after_idle != NULL) + CC_ALGO(tp)->after_idle(tp); } tp->t_flags &= ~TF_LASTIDLE; if (idle) { Index: netinet/tcp_usrreq.c =================================================================== --- netinet/tcp_usrreq.c (.../head/sys) (revision 190777) +++ netinet/tcp_usrreq.c (.../projects/tcp_cc_8.x/sys) (revision 190777) @@ -62,6 +62,7 @@ #include #include +#include #include #include #ifdef INET6 @@ -77,7 +78,6 @@ #include #include #endif -#include #include #include #include @@ -1256,6 +1256,8 @@ struct inpcb *inp; struct tcpcb *tp; struct tcp_info ti; + char buf[TCP_CA_NAME_MAX]; + struct cc_algo *cc_algo; error = 0; inp = sotoinpcb(so); @@ -1365,6 +1367,58 @@ error = EINVAL; break; + case TCP_CONGESTION: + INP_WUNLOCK(inp); + bzero(buf, sizeof(buf)); + error = sooptcopyin(sopt, &buf, sizeof(buf), 1); + if (error) + break; + INP_WLOCK_RECHECK(inp); + /* + * We return EINVAL if we can't find the requested cc + * algo. We set error here and reset to 0 if found to + * simplify the error checking, + */ + error = EINVAL; + CC_LIST_RLOCK(); + STAILQ_FOREACH(cc_algo, &cc_list, entries) { + if ( strncmp(buf, + cc_algo->name, + TCP_CA_NAME_MAX) == 0) { + /* + * we've found the requested algo, + * so revert the EINVAL error condition. 
+ */ + error = 0; + /* + * we hold a write lock over the tcb + * so it's safe to do these things + * without ordering concerns + */ + if (CC_ALGO(tp)->deinit != NULL) + CC_ALGO(tp)->deinit(tp); + CC_ALGO(tp) = cc_algo; + /* + * if something goes pear shaped + * initialising the new algo, + * fall back to newreno (which + * does not require initialisation) + */ + if (cc_algo->init(tp) > 0) { + CC_ALGO(tp) = &newreno_cc_algo; + /* + * the only reason init() should + * fail is because of malloc + */ + error = ENOMEM; + } + break; /* break the STAILQ_FOREACH */ + } + } + CC_LIST_RUNLOCK(); + INP_WUNLOCK(inp); + break; + default: INP_WUNLOCK(inp); error = ENOPROTOOPT; @@ -1408,6 +1462,12 @@ INP_WUNLOCK(inp); error = sooptcopyout(sopt, &ti, sizeof ti); break; + case TCP_CONGESTION: + bzero(buf, sizeof(buf)); + strlcpy(buf, CC_ALGO(tp)->name, TCP_CA_NAME_MAX); + INP_WUNLOCK(inp); + error = sooptcopyout(sopt, buf, TCP_CA_NAME_MAX); + break; default: INP_WUNLOCK(inp); error = ENOPROTOOPT; Index: netinet/cc.c =================================================================== --- netinet/cc.c (.../head/sys) (revision 0) +++ netinet/cc.c (.../projects/tcp_cc_8.x/sys) (revision 190777) @@ -0,0 +1,454 @@ +/*- + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 + * The Regents of the University of California. + * Copyright (c) 2008 Swinburne University of Technology, Melbourne, Australia + * All rights reserved. + * + * The majority of this software was developed at the Centre for + * Advanced Internet Architectures, Swinburne University, by Lawrence Stewart + * and James Healy, made possible in part by a grant from the Cisco University + * Research Program Fund at Community Foundation Silicon Valley. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +/* list of available cc algorithms on the current system */ +struct cc_head cc_list = STAILQ_HEAD_INITIALIZER(cc_list); + +struct rwlock cc_list_lock; + +/* create a struct to point to our newreno functions */ +struct cc_algo newreno_cc_algo = { + .name = "newreno", + .init = newreno_init, + .deinit = NULL, + .cwnd_init = newreno_cwnd_init, + .ack_received = newreno_ack_received, + .pre_fr = newreno_pre_fr, + .post_fr = newreno_post_fr, + .after_idle = newreno_after_idle, + .after_timeout = newreno_after_timeout +}; + +/* the system wide default cc algorithm */ +char cc_algorithm[TCP_CA_NAME_MAX]; + +/* + * sysctl handler that allows the default cc algorithm for the system to be + * viewed and changed + */ +static int +cc_default_algorithm(SYSCTL_HANDLER_ARGS) +{ + struct cc_algo *funcs; + + if (req->newptr == NULL) + goto skip; + + CC_LIST_RLOCK(); + STAILQ_FOREACH(funcs, &cc_list, entries) { + if (strncmp((char *)req->newptr, funcs->name, TCP_CA_NAME_MAX) == 0) + goto reorder; + } + CC_LIST_RUNLOCK(); + + return 1; + +reorder: + /* + * Make the selected system default cc algorithm + * the first element in the list if it isn't already + */ + CC_LIST_RUNLOCK(); + CC_LIST_WLOCK(); + if (funcs != STAILQ_FIRST(&cc_list)) { + STAILQ_REMOVE(&cc_list, funcs, cc_algo, entries); + STAILQ_INSERT_HEAD(&cc_list, funcs, entries); + } + CC_LIST_WUNLOCK(); + +skip: + return sysctl_handle_string(oidp, arg1, arg2, req); +} + +/* + * sysctl handler that displays the available cc algorithms as a read + * only value + */ +static int +cc_list_available(SYSCTL_HANDLER_ARGS) +{ + struct cc_algo *algo; + int error = 0, first = 1; + struct sbuf *s = NULL; + + if ((s = sbuf_new(NULL, NULL, TCP_CA_NAME_MAX, SBUF_AUTOEXTEND)) == NULL) + return -1; + + CC_LIST_RLOCK(); + 
STAILQ_FOREACH(algo, &cc_list, entries) { + error = sbuf_printf(s, (first) ? "%s" : ", %s", algo->name); + if (error != 0) + break; + first = 0; + } + CC_LIST_RUNLOCK(); + + if (!error) { + sbuf_finish(s); + error = sysctl_handle_string(oidp, sbuf_data(s), 1, req); + } + + sbuf_delete(s); + return error; +} + +/* + * Initialise cc on system boot + */ +void +cc_init() +{ + /* initialise the lock that will protect read/write access to our linked list */ + CC_LIST_LOCK_INIT(); + + /* initialise list of cc algorithms */ + STAILQ_INIT(&cc_list); + + /* add newreno to the list of available algorithms */ + cc_register_algorithm(&newreno_cc_algo); + + /* set newreno to the system default */ + strlcpy(cc_algorithm, newreno_cc_algo.name, TCP_CA_NAME_MAX); +} + +/* + * Returns 1 on success, 0 on failure + */ +int +cc_deregister_algorithm(struct cc_algo *remove_cc) +{ + struct cc_algo *funcs, *tmpfuncs; + register struct tcpcb *tp = NULL; + register struct inpcb *inp = NULL; + int success = 0; + + /* remove the algorithm from the list available to the system */ + CC_LIST_RLOCK(); + STAILQ_FOREACH_SAFE(funcs, &cc_list, entries, tmpfuncs) { + if (funcs == remove_cc) { + if (CC_LIST_TRY_WLOCK()) { + /* if this algorithm is the system default, reset the default to newreno */ + if (strncmp(cc_algorithm, remove_cc->name, TCP_CA_NAME_MAX) == 0) + snprintf(cc_algorithm, TCP_CA_NAME_MAX, "%s", newreno_cc_algo.name); + + STAILQ_REMOVE(&cc_list, funcs, cc_algo, entries); + success = 1; + CC_LIST_W2RLOCK(); + } + break; + } + } + CC_LIST_RUNLOCK(); + + if (success) { + /* + * check all active control blocks and change any that are using this + * algorithm back to newreno. 
If the algorithm that was in use requires + * deinit code to be run, call it + */ + INP_INFO_RLOCK(&V_tcbinfo); + LIST_FOREACH(inp, &V_tcb, inp_list) { + /* skip tcptw structs */ + if (inp->inp_vflag & INP_TIMEWAIT) + continue; + INP_WLOCK(inp); + if ((tp = intotcpcb(inp)) != NULL) { + if (strncmp(CC_ALGO(tp)->name, remove_cc->name, TCP_CA_NAME_MAX) == 0 ) { + tmpfuncs = CC_ALGO(tp); + CC_ALGO(tp) = &newreno_cc_algo; + /* + * XXX: We should stall here until + * we're sure the tcb has stopped + * using the deregistered algo's functions... + * Not sure how to do that yet! + */ + if(CC_ALGO(tp)->init != NULL) + CC_ALGO(tp)->init(tp); + if (tmpfuncs->deinit != NULL) + tmpfuncs->deinit(tp); + } + } + INP_WUNLOCK(inp); + } + INP_INFO_RUNLOCK(&V_tcbinfo); + } + + return success; +} + +int +cc_register_algorithm(struct cc_algo *add_cc) +{ + CC_LIST_WLOCK(); + STAILQ_INSERT_TAIL(&cc_list, add_cc, entries); + CC_LIST_WUNLOCK(); + return 1; +} + +/* + * NEW RENO + */ + +int +newreno_init(struct tcpcb *tp) +{ + return 0; +} + +/* + * update ssthresh to approx 1/2 of cwnd + */ +void +newreno_ssthresh_update(struct tcpcb *tp) +{ + u_int win; + + /* reset ssthresh */ + win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; + + if (win < 2) + win = 2; + + tp->snd_ssthresh = win * tp->t_maxseg; +} + +/* + * initial cwnd at the start of a connection + * if there is a hostcache entry for the foreign host, base cwnd on that + * if rfc3390 is enabled, set cwnd to approx 4 MSS as recommended + * otherwise use the sysctl variables configured by the administrator + */ +void +newreno_cwnd_init(struct tcpcb *tp) +{ + struct hc_metrics_lite metrics; + struct inpcb *inp = tp->t_inpcb; + struct socket *so = inp->inp_socket; + + /* + * Set the slow-start flight size depending on whether this + * is a local network or not. + * + * Extend this so we cache the cwnd too and retrieve it here. 
+ * Make cwnd even bigger than RFC3390 suggests but only if we + * have previous experience with the remote host. Be careful + * not to make cwnd bigger than remote receive window or our own + * send socket buffer. Maybe put some additional upper bound + * on the retrieved cwnd. Should do incremental updates to + * hostcache when cwnd collapses so next connection doesn't + * overload the path again. + * + * RFC3390 says only do this if SYN or SYN/ACK didn't get lost. + * We currently check only in syncache_socket for that. + */ + + tcp_hc_get(&inp->inp_inc, &metrics); + +#define TCP_METRICS_CWND +#ifdef TCP_METRICS_CWND + if (metrics.rmx_cwnd) + tp->snd_cwnd = max(tp->t_maxseg, + min(metrics.rmx_cwnd / 2, + min(tp->snd_wnd, so->so_snd.sb_hiwat))); + else +#endif + if (V_tcp_do_rfc3390) + tp->snd_cwnd = min(4 * tp->t_maxseg, max(2 * tp->t_maxseg, 4380)); +#ifdef INET6 + else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) || + (!isipv6 && in_localaddr(inp->inp_faddr))) +#else + else if (in_localaddr(inp->inp_faddr)) +#endif + tp->snd_cwnd = tp->t_maxseg * V_ss_fltsz_local; + else + tp->snd_cwnd = tp->t_maxseg * V_ss_fltsz; +} + +/* + * increase cwnd on receipt of a successful ACK + * if cwnd <= ssthresh, increase by 1 MSS per ACK + * if cwnd > ssthresh, increase by ~1 MSS per RTT + */ +void +newreno_ack_received(struct tcpcb *tp, struct tcphdr *th) +{ + u_int cw = tp->snd_cwnd; + u_int incr = tp->t_maxseg; + + /* + * If cwnd <= ssthresh, open exponentially (maxseg per packet). + * Otherwise, open linearly (approx. maxseg per RTT + * i.e. maxseg^2 / cwnd per ACK received). + * If cwnd > maxseg^2, fix the cwnd increment at 1 byte + * to avoid capping cwnd (as suggested in RFC2581). 
+ */ + if (cw > tp->snd_ssthresh) + incr = max((incr * incr / cw), 1); + + tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<<tp->snd_scale); +} + +/* + * update the value of ssthresh before entering FR + */ +void +newreno_pre_fr(struct tcpcb *tp, struct tcphdr *th) +{ + newreno_ssthresh_update(tp); +} + +/* + * decrease the cwnd in response to packet loss or a transmit timeout. + * th can be null, in which case cwnd will be set according to reno instead + * of new reno. + */ +void +newreno_post_fr(struct tcpcb *tp, struct tcphdr *th) +{ + /* + * Out of fast recovery. + * Window inflation should have left us + * with approximately snd_ssthresh + * outstanding data. + * But in case we would be inclined to + * send a burst, better to do it via + * the slow start mechanism. + */ + if (th && SEQ_GT(th->th_ack + tp->snd_ssthresh, tp->snd_max)) + tp->snd_cwnd = tp->snd_max - th->th_ack + tp->t_maxseg; + else + tp->snd_cwnd = tp->snd_ssthresh; +} + +/* + * if a connection has been idle for a while and more data is ready to be sent, + * reset cwnd + */ +void +newreno_after_idle(struct tcpcb *tp) +{ + /* + * We have been idle for "a while" and no acks are + * expected to clock out any data we send -- + * slow start to get ack "clock" running again. + * + * Set the slow-start flight size depending on whether + * this is a local network or not. + * + * Set the slow-start flight size depending on whether + * this is a local network or not. + */ + int ss = V_ss_fltsz; + +#ifdef INET6 + if (isipv6) { + if (in6_localaddr(&tp->t_inpcb->in6p_faddr)) + ss = V_ss_fltsz_local; + } else +#endif /* INET6 */ + + if (in_localaddr(tp->t_inpcb->inp_faddr)) + ss = V_ss_fltsz_local; + + tp->snd_cwnd = tp->t_maxseg * ss; +} + +/* + * reset the cwnd after a transmission timeout. + */ +void +newreno_after_timeout(struct tcpcb *tp) +{ + newreno_ssthresh_update(tp); + + /* + * Close the congestion window down to one segment + * (we'll open it by one segment for each ack we get). 
+ * Since we probably have a window's worth of unacked + * data accumulated, this "slow start" keeps us from + * dumping all that data as back-to-back packets (which + * might overwhelm an intermediate gateway). + * + * There are two phases to the opening: Initially we + * open by one mss on each ack. This makes the window + * size increase exponentially with time. If the + * window is larger than the path can handle, this + * exponential growth results in dropped packet(s) + * almost immediately. To get more time between + * drops but still "push" the network to take advantage + * of improving conditions, we switch from exponential + * to linear window opening at some threshhold size. + * For a threshhold, we use half the current window + * size, truncated to a multiple of the mss. + * + * (the minimum cwnd that will give us exponential + * growth is 2 mss. We don't allow the threshhold + * to go below this.) + */ + tp->snd_cwnd = tp->t_maxseg; +} + +SYSCTL_NODE(_net_inet_tcp, OID_AUTO, cc, CTLFLAG_RW, NULL, + "congestion control related settings"); + +SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, algorithm, CTLTYPE_STRING|CTLFLAG_RW, + &cc_algorithm, sizeof(cc_algorithm), cc_default_algorithm, "A", + "default congestion control algorithm"); + +SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, available, CTLTYPE_STRING|CTLFLAG_RD, + NULL, 0, cc_list_available, "A", + "list available congestion control algorithms"); Index: netinet/cc.h =================================================================== --- netinet/cc.h (.../head/sys) (revision 0) +++ netinet/cc.h (.../projects/tcp_cc_8.x/sys) (revision 190777) @@ -0,0 +1,119 @@ +/*- + * Copyright (c) 2008 Swinburne University of Technology, Melbourne, Australia + * All rights reserved. 
+ * + * This software was developed at the Centre for Advanced Internet + * Architectures, Swinburne University, by Lawrence Stewart and James Healy, + * made possible in part by a grant from the Cisco University Research Program + * Fund at Community Foundation Silicon Valley. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_CC_H_
+#define _NETINET_CC_H_
+
+/* Needed for TCP_CA_NAME_MAX define which lives in tcp.h for compat reasons */
+#include <netinet/tcp.h>
+
+/*
+ * Global CC vars
+ */
+extern	STAILQ_HEAD(cc_head, cc_algo) cc_list;
+extern	char cc_algorithm[];
+extern	const int tcprexmtthresh;
+extern	struct cc_algo newreno_cc_algo;
+
+/*
+ * Define the new net.inet.tcp.cc sysctl tree
+ */
+SYSCTL_DECL(_net_inet_tcp_cc);
+
+/*
+ * CC housekeeping functions
+ */
+void	cc_init(void);
+int	cc_register_algorithm(struct cc_algo *add_cc);
+int	cc_deregister_algorithm(struct cc_algo *remove_cc);
+
+/*
+ * NewReno CC functions
+ */
+int	newreno_init(struct tcpcb *tp);
+void	newreno_cwnd_init(struct tcpcb *tp);
+void	newreno_ack_received(struct tcpcb *tp, struct tcphdr *th);
+void	newreno_pre_fr(struct tcpcb *tp, struct tcphdr *th);
+void	newreno_post_fr(struct tcpcb *tp, struct tcphdr *th);
+void	newreno_after_idle(struct tcpcb *tp);
+void	newreno_after_timeout(struct tcpcb *tp);
+void	newreno_ssthresh_update(struct tcpcb *tp);
+
+/*
+ * Structure to hold function pointers to the functions responsible
+ * for congestion control. Based on a similar structure in the SCTP stack.
+ */
+struct cc_algo {
+	char	name[TCP_CA_NAME_MAX];
+
+	/* init the congestion algorithm for the specified control block */
+	int	(*init) (struct tcpcb *tp);
+
+	/* deinit the congestion algorithm for the specified control block */
+	void	(*deinit) (struct tcpcb *tp);
+
+	/* initialise cwnd at the start of a connection */
+	void	(*cwnd_init) (struct tcpcb *tp);
+
+	/* called on the receipt of a valid ack */
+	void	(*ack_received) (struct tcpcb *tp, struct tcphdr *th);
+
+	/* called before entering fast recovery (FR) */
+	void	(*pre_fr) (struct tcpcb *tp, struct tcphdr *th);
+
+	/* called after exiting fast recovery (FR) */
+	void	(*post_fr) (struct tcpcb *tp, struct tcphdr *th);
+
+	/* perform tasks when data transfer resumes after an idle period */
+	void	(*after_idle) (struct tcpcb *tp);
+
+	/* perform tasks when the connection's retransmit timer expires */
+	void	(*after_timeout) (struct tcpcb *tp);
+
+	STAILQ_ENTRY(cc_algo) entries;
+};
+
+#define	CC_ALGO(tp)	((tp)->cc_algo)
+#define	CC_DATA(tp)	((tp)->cc_data)
+
+extern	struct rwlock cc_list_lock;
+#define	CC_LIST_LOCK_INIT()	rw_init(&cc_list_lock, "cc_list")
+#define	CC_LIST_LOCK_DESTROY()	rw_destroy(&cc_list_lock)
+#define	CC_LIST_RLOCK()		rw_rlock(&cc_list_lock)
+#define	CC_LIST_RUNLOCK()	rw_runlock(&cc_list_lock)
+#define	CC_LIST_WLOCK()		rw_wlock(&cc_list_lock)
+#define	CC_LIST_WUNLOCK()	rw_wunlock(&cc_list_lock)
+#define	CC_LIST_TRY_WLOCK()	rw_try_upgrade(&cc_list_lock)
+#define	CC_LIST_W2RLOCK()	rw_downgrade(&cc_list_lock)
+
+#endif /* _NETINET_CC_H_ */