--- Copyright (c) 2007-2008, Centre for Advanced Internet Architectures --- Swinburne University of Technology, Melbourne, Australia --- (CRICOS number 00111D). --- Copyright (c) 2008-2010, Lawrence Stewart --- All rights reserved. --- --- Redistribution and use in source and binary forms, with or without --- modification, are permitted provided that the following conditions --- are met: --- 1. Redistributions of source code must retain the above copyright --- notice, this list of conditions and the following disclaimer. --- 2. Redistributions in binary form must reproduce the above copyright --- notice, this list of conditions and the following disclaimer in the --- documentation and/or other materials provided with the distribution. --- 3. The names of the authors, "Swinburne University of Technology" and the --- "Centre for Advanced Internet Architectures" may not be used to endorse --- or promote products derived from this software without specific --- prior written permission. --- --- THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND --- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE --- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE --- ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE --- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL --- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS --- OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) --- HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT --- LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY --- OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF --- SUCH DAMAGE. 
--- --- CAIA Modular Congestion Control v0.9.4 and Khelp Framework v0.1.0 Bundle --- --- This patch was created against revision 203910 of the FreeBSD 9-CURRENT --- Subversion source tree by running the following command and cleaning up the --- output to remove irrelevant parts: --- --- svn diff http://svn.freebsd.org/base/head/sys@203910 \ --- http://svn.freebsd.org/base/projects/tcp_cc_head/sys@203947 --- --- To obtain the correct revision of the FreeBSD source tree that this patch --- applies to, and store it in the local directory "/path/to/src", run: --- --- svn co -r 203910 http://svn.freebsd.org/base/head --- --- Make sure the base system you are installing onto is already running --- FreeBSD 9.x before continuing. --- --- Issuing the following commands will result in a running modular congestion --- control and Khelp framework capable system: --- --- cd /path/to/src/sys --- patch -p0 < /path/to/caia_modularcc_v0.9.4_khelp_v0.1.0_bundle_9.x.r203910.patch --- cd /path/to/src/ --- make buildworld buildkernel installkernel installworld --- mergemaster -iF -m /path/to/src --- reboot --- --- The modular congestion control patch was first released in 2007 by --- James Healy and Lawrence Stewart whilst working on the NewTCP research --- project at Swinburne University's Centre for Advanced Internet --- Architectures, Melbourne, Australia, which was made possible in part by a --- grant from the Cisco University Research Program Fund at --- Community Foundation Silicon Valley. More details are available at: --- http://caia.swin.edu.au/urp/newtcp/ --- --- Lawrence Stewart has continued development of this work since 2008 in his --- spare time. --- --- The Khelp framework patch was first released in 2010 by Lawrence Stewart --- whilst studying at Swinburne University's Centre for Advanced Internet --- Architectures, Melbourne, Australia. The work is released as part --- of the NewTCP research project. 
More details are available at: --- http://caia.swin.edu.au/urp/newtcp/ --- --- Lawrence Stewart is currently the sole maintainer of both patches. --- All contact regarding this bundle patch should be directed to him --- via email: lastewart@swin.edu.au --- Index: conf/files =================================================================== --- conf/files (.../head/sys) (revision 203910) +++ conf/files (.../projects/tcp_cc_head/sys) (revision 203947) @@ -2465,6 +2465,8 @@ netinet/accf_data.c optional accept_filter_data inet netinet/accf_dns.c optional accept_filter_dns inet netinet/accf_http.c optional accept_filter_http inet +netinet/helper.c optional inet +netinet/hhooks.c optional inet netinet/if_atm.c optional atm netinet/if_ether.c optional inet ether netinet/igmp.c optional inet @@ -2498,6 +2500,8 @@ netinet/ip_options.c optional inet netinet/ip_output.c optional inet netinet/raw_ip.c optional inet +netinet/cc.c optional inet +netinet/cc_newreno.c optional inet netinet/sctp_asconf.c optional inet sctp netinet/sctp_auth.c optional inet sctp netinet/sctp_bsd_addr.c optional inet sctp Index: netinet/tcp_input.c =================================================================== --- netinet/tcp_input.c (.../head/sys) (revision 203910) +++ netinet/tcp_input.c (.../projects/tcp_cc_head/sys) (revision 203947) @@ -56,6 +56,8 @@ #define TCPSTATES /* for logging */ +#include +#include #include #include #include @@ -75,7 +78,6 @@ #include #include #include -#include #include #include #include @@ -96,7 +98,7 @@ #include -static const int tcprexmtthresh = 3; +const int tcprexmtthresh = 3; VNET_DEFINE(struct tcpstat, tcpstat); VNET_DEFINE(int, blackhole); @@ -194,8 +196,10 @@ struct tcphdr *, struct mbuf *, int); static void tcp_xmit_timer(struct tcpcb *, int); static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *); -static void inline - tcp_congestion_exp(struct tcpcb *); +static void inline cc_ack_received(struct tcpcb *tp, struct tcphdr *th); +static void 
inline cc_conn_init(struct tcpcb *tp); +static void inline cc_pre_fr(struct tcpcb *tp, struct tcphdr *th); +static void inline cc_post_fr(struct tcpcb *tp, struct tcphdr *th); /* * Kernel module interface for updating tcpstat. The argument is an index @@ -211,22 +215,126 @@ (*((u_long *)&V_tcpstat + statnum))++; } +/* + * CC wrapper hook functions + */ static void inline -tcp_congestion_exp(struct tcpcb *tp) +cc_ack_received(struct tcpcb *tp, struct tcphdr *th) { - u_int win; - - win = min(tp->snd_wnd, tp->snd_cwnd) / - 2 / tp->t_maxseg; - if (win < 2) - win = 2; - tp->snd_ssthresh = win * tp->t_maxseg; + INP_WLOCK_ASSERT(tp->t_inpcb); + + if (!IN_FASTRECOVERY(tp) && CC_ALGO(tp)->ack_received != NULL) + CC_ALGO(tp)->ack_received(tp, th); +} + +static void inline +cc_conn_init(struct tcpcb *tp) +{ + struct hc_metrics_lite metrics; + struct inpcb *inp = tp->t_inpcb; + struct socket *so = inp->inp_socket; + int rtt; +#ifdef INET6 + int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; +#endif + + INP_WLOCK_ASSERT(tp->t_inpcb); + + tcp_hc_get(&inp->inp_inc, &metrics); + + if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) { + tp->t_srtt = rtt; + tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE; + TCPSTAT_INC(tcps_usedrtt); + if (metrics.rmx_rttvar) { + tp->t_rttvar = metrics.rmx_rttvar; + TCPSTAT_INC(tcps_usedrttvar); + } else { + /* default variation is +- 1 rtt */ + tp->t_rttvar = + tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; + } + TCPT_RANGESET(tp->t_rxtcur, + ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, + tp->t_rttmin, TCPTV_REXMTMAX); + } + if (metrics.rmx_ssthresh) { + /* + * There's some sort of gateway or interface + * buffer limit on the path. Use this to set + * the slow start threshhold, but set the + * threshold to no less than 2*mss. 
+ */ + tp->snd_ssthresh = max(2 * tp->t_maxseg, metrics.rmx_ssthresh); + TCPSTAT_INC(tcps_usedssthresh); + } + if (metrics.rmx_bandwidth) + tp->snd_bandwidth = metrics.rmx_bandwidth; + + /* + * Set the slow-start flight size depending on whether this + * is a local network or not. + * + * Extend this so we cache the cwnd too and retrieve it here. + * Make cwnd even bigger than RFC3390 suggests but only if we + * have previous experience with the remote host. Be careful + * not make cwnd bigger than remote receive window or our own + * send socket buffer. Maybe put some additional upper bound + * on the retrieved cwnd. Should do incremental updates to + * hostcache when cwnd collapses so next connection doesn't + * overloads the path again. + * + * RFC3390 says only do this if SYN or SYN/ACK didn't got lost. + * We currently check only in syncache_socket for that. + */ +#define TCP_METRICS_CWND +#ifdef TCP_METRICS_CWND + if (metrics.rmx_cwnd) + tp->snd_cwnd = max(tp->t_maxseg, + min(metrics.rmx_cwnd / 2, + min(tp->snd_wnd, so->so_snd.sb_hiwat))); + else +#endif + if (V_tcp_do_rfc3390) + tp->snd_cwnd = min(4 * tp->t_maxseg, max(2 * tp->t_maxseg, +4380)); +#ifdef INET6 + else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) || + (!isipv6 && in_localaddr(inp->inp_faddr))) +#else + else if (in_localaddr(inp->inp_faddr)) +#endif + tp->snd_cwnd = tp->t_maxseg * V_ss_fltsz_local; + else + tp->snd_cwnd = tp->t_maxseg * V_ss_fltsz; + + if (CC_ALGO(tp)->conn_init != NULL) + CC_ALGO(tp)->conn_init(tp); +} + +static void inline +cc_pre_fr(struct tcpcb *tp, struct tcphdr *th) +{ + INP_WLOCK_ASSERT(tp->t_inpcb); + + if (CC_ALGO(tp)->pre_fr != NULL) + CC_ALGO(tp)->pre_fr(tp, th); + ENTER_FASTRECOVERY(tp); tp->snd_recover = tp->snd_max; if (tp->t_flags & TF_ECN_PERMIT) tp->t_flags |= TF_ECN_SND_CWR; } +static void inline +cc_post_fr(struct tcpcb *tp, struct tcphdr *th) +{ + INP_WLOCK_ASSERT(tp->t_inpcb); + + if (CC_ALGO(tp)->post_fr != NULL) + CC_ALGO(tp)->post_fr(tp, th); +} + /* 
Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */ #ifdef INET6 #define ND6_HINT(tp) \ @@ -1067,7 +1175,7 @@ int rstreason, todrop, win; u_long tiwin; struct tcpopt to; - + struct tcp_hhook_data hhook_data; #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, @@ -1157,7 +1265,7 @@ if ((thflags & TH_ECE) && SEQ_LEQ(th->th_ack, tp->snd_recover)) { TCPSTAT_INC(tcps_ecn_rcwnd); - tcp_congestion_exp(tp); + cc_pre_fr(tp, th); } } @@ -1253,14 +1361,9 @@ if (SEQ_GT(th->th_ack, tp->snd_una) && SEQ_LEQ(th->th_ack, tp->snd_max) && tp->snd_cwnd >= tp->snd_wnd && - ((!V_tcp_do_newreno && - !(tp->t_flags & TF_SACK_PERMIT) && - tp->t_dupacks < tcprexmtthresh) || - ((V_tcp_do_newreno || - (tp->t_flags & TF_SACK_PERMIT)) && - !IN_FASTRECOVERY(tp) && - (to.to_flags & TOF_SACK) == 0 && - TAILQ_EMPTY(&tp->snd_holes)))) { + !IN_FASTRECOVERY(tp) && + (to.to_flags & TOF_SACK) == 0 && + TAILQ_EMPTY(&tp->snd_holes)) { /* * This is a pure ack for outstanding data. 
*/ @@ -1315,7 +1418,7 @@ ticks - tp->t_rtttime); } tcp_xmit_bandwidth_limit(tp, th->th_ack); - acked = th->th_ack - tp->snd_una; + acked = BYTES_ACKED(tp, th); TCPSTAT_INC(tcps_rcvackpack); TCPSTAT_ADD(tcps_rcvackbyte, acked); sbdrop(&so->so_snd, acked); @@ -1581,6 +1684,7 @@ thflags &= ~TH_SYN; } else { tp->t_state = TCPS_ESTABLISHED; + cc_conn_init(tp); tcp_timer_activate(tp, TT_KEEP, tcp_keepidle); } } else { @@ -1984,6 +2088,7 @@ tp->t_flags &= ~TF_NEEDFIN; } else { tp->t_state = TCPS_ESTABLISHED; + cc_conn_init(tp); tcp_timer_activate(tp, TT_KEEP, tcp_keepidle); } /* @@ -2014,10 +2119,21 @@ TCPSTAT_INC(tcps_rcvacktoomuch); goto dropafterack; } + hhook_data.new_sacked_bytes = 0; if ((tp->t_flags & TF_SACK_PERMIT) && ((to.to_flags & TOF_SACK) || - !TAILQ_EMPTY(&tp->snd_holes))) + !TAILQ_EMPTY(&tp->snd_holes))) { tcp_sack_doack(tp, &to, th->th_ack); + /* XXXDH: should only be one if a productive SACK */ + hhook_data.new_sacked_bytes = 1; + } + + hhook_data.tp = tp; + hhook_data.th = th; + hhook_data.to = &to; + run_hhooks(HHOOK_TYPE_TCP, HHOOK_TCP_ESTABLISHED_IN, + &hhook_data, tp->hdbs); + if (SEQ_LEQ(th->th_ack, tp->snd_una)) { if (tlen == 0 && tiwin == tp->snd_wnd) { TCPSTAT_INC(tcps_rcvdupack); @@ -2052,9 +2168,7 @@ th->th_ack != tp->snd_una) tp->t_dupacks = 0; else if (++tp->t_dupacks > tcprexmtthresh || - ((V_tcp_do_newreno || - (tp->t_flags & TF_SACK_PERMIT)) && - IN_FASTRECOVERY(tp))) { + IN_FASTRECOVERY(tp)) { if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp)) { int awnd; @@ -2091,15 +2205,15 @@ tp->t_dupacks = 0; break; } - } else if (V_tcp_do_newreno || - V_tcp_do_ecn) { + } else { if (SEQ_LEQ(th->th_ack, tp->snd_recover)) { tp->t_dupacks = 0; break; } } - tcp_congestion_exp(tp); + + cc_pre_fr(tp, th); tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; if (tp->t_flags & TF_SACK_PERMIT) { @@ -2164,37 +2278,14 @@ * If the congestion window was inflated to account * for the other side's cached packets, retract it. 
*/ - if (V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) { - if (IN_FASTRECOVERY(tp)) { - if (SEQ_LT(th->th_ack, tp->snd_recover)) { - if (tp->t_flags & TF_SACK_PERMIT) - tcp_sack_partialack(tp, th); - else - tcp_newreno_partial_ack(tp, th); - } else { - /* - * Out of fast recovery. - * Window inflation should have left us - * with approximately snd_ssthresh - * outstanding data. - * But in case we would be inclined to - * send a burst, better to do it via - * the slow start mechanism. - */ - if (SEQ_GT(th->th_ack + - tp->snd_ssthresh, - tp->snd_max)) - tp->snd_cwnd = tp->snd_max - - th->th_ack + - tp->t_maxseg; - else - tp->snd_cwnd = tp->snd_ssthresh; - } - } - } else { - if (tp->t_dupacks >= tcprexmtthresh && - tp->snd_cwnd > tp->snd_ssthresh) - tp->snd_cwnd = tp->snd_ssthresh; + if (IN_FASTRECOVERY(tp)) { + if (SEQ_LT(th->th_ack, tp->snd_recover)) { + if (tp->t_flags & TF_SACK_PERMIT) + tcp_sack_partialack(tp, th); + else + tcp_newreno_partial_ack(tp, th); + } else + cc_post_fr(tp, th); } tp->t_dupacks = 0; /* @@ -2225,7 +2316,7 @@ ("tcp_input: process_ACK ti_locked %d", ti_locked)); INP_WLOCK_ASSERT(tp->t_inpcb); - acked = th->th_ack - tp->snd_una; + acked = BYTES_ACKED(tp, th); TCPSTAT_INC(tcps_rcvackpack); TCPSTAT_ADD(tcps_rcvackbyte, acked); @@ -2294,60 +2385,11 @@ /* * When new data is acked, open the congestion window. - * Method depends on which congestion control state we're - * in (slow start or cong avoid) and if ABC (RFC 3465) is - * enabled. - * - * slow start: cwnd <= ssthresh - * cong avoid: cwnd > ssthresh - * - * slow start and ABC (RFC 3465): - * Grow cwnd exponentially by the amount of data - * ACKed capping the max increment per ACK to - * (abc_l_var * maxseg) bytes. - * - * slow start without ABC (RFC 2581): - * Grow cwnd exponentially by maxseg per ACK. - * - * cong avoid and ABC (RFC 3465): - * Grow cwnd linearly by maxseg per RTT for each - * cwnd worth of ACKed data. 
- * - * cong avoid without ABC (RFC 2581): - * Grow cwnd linearly by approximately maxseg per RTT using - * maxseg^2 / cwnd per ACK as the increment. - * If cwnd > maxseg^2, fix the cwnd increment at 1 byte to - * avoid capping cwnd. + * The specifics of how this is achieved are up to the + * congestion control algorithm in use for this connection. */ - if ((!V_tcp_do_newreno && !(tp->t_flags & TF_SACK_PERMIT)) || - !IN_FASTRECOVERY(tp)) { - u_int cw = tp->snd_cwnd; - u_int incr = tp->t_maxseg; - /* In congestion avoidance? */ - if (cw > tp->snd_ssthresh) { - if (V_tcp_do_rfc3465) { - tp->t_bytes_acked += acked; - if (tp->t_bytes_acked >= tp->snd_cwnd) - tp->t_bytes_acked -= cw; - else - incr = 0; - } - else - incr = max((incr * incr / cw), 1); - /* - * In slow-start with ABC enabled and no RTO in sight? - * (Must not use abc_l_var > 1 if slow starting after an - * RTO. On RTO, snd_nxt = snd_una, so the snd_nxt == - * snd_max check is sufficient to handle this). - */ - } else if (V_tcp_do_rfc3465 && - tp->snd_nxt == tp->snd_max) - incr = min(acked, - V_tcp_abc_l_var * tp->t_maxseg); - /* ABC is on by default, so (incr == 0) frequently. */ - if (incr > 0) - tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<snd_scale); - } + cc_ack_received(tp, th); + SOCKBUF_LOCK(&so->so_snd); if (acked > so->so_snd.sb_cc) { tp->snd_wnd -= so->so_snd.sb_cc; @@ -2361,13 +2403,11 @@ /* NB: sowwakeup_locked() does an implicit unlock. */ sowwakeup_locked(so); /* Detect una wraparound. 
*/ - if ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && - !IN_FASTRECOVERY(tp) && + if (!IN_FASTRECOVERY(tp) && SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; - if ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && - IN_FASTRECOVERY(tp) && + if (IN_FASTRECOVERY(tp) && SEQ_GEQ(th->th_ack, tp->snd_recover)) { EXIT_FASTRECOVERY(tp); tp->t_bytes_acked = 0; @@ -3235,24 +3275,19 @@ void tcp_mss(struct tcpcb *tp, int offer) { - int rtt, mss; + int mss; u_long bufsize; struct inpcb *inp; struct socket *so; struct hc_metrics_lite metrics; int mtuflags = 0; -#ifdef INET6 - int isipv6; -#endif + KASSERT(tp != NULL, ("%s: tp == NULL", __func__)); tcp_mss_update(tp, offer, &metrics, &mtuflags); mss = tp->t_maxseg; inp = tp->t_inpcb; -#ifdef INET6 - isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; -#endif /* * If there's a pipesize, change the socket buffer to that size, @@ -3292,74 +3327,7 @@ (void)sbreserve_locked(&so->so_rcv, bufsize, so, NULL); } SOCKBUF_UNLOCK(&so->so_rcv); - /* - * While we're here, check the others too. - */ - if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) { - tp->t_srtt = rtt; - tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE; - TCPSTAT_INC(tcps_usedrtt); - if (metrics.rmx_rttvar) { - tp->t_rttvar = metrics.rmx_rttvar; - TCPSTAT_INC(tcps_usedrttvar); - } else { - /* default variation is +- 1 rtt */ - tp->t_rttvar = - tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; - } - TCPT_RANGESET(tp->t_rxtcur, - ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, - tp->t_rttmin, TCPTV_REXMTMAX); - } - if (metrics.rmx_ssthresh) { - /* - * There's some sort of gateway or interface - * buffer limit on the path. Use this to set - * the slow start threshhold, but set the - * threshold to no less than 2*mss. 
- */ - tp->snd_ssthresh = max(2 * mss, metrics.rmx_ssthresh); - TCPSTAT_INC(tcps_usedssthresh); - } - if (metrics.rmx_bandwidth) - tp->snd_bandwidth = metrics.rmx_bandwidth; - /* - * Set the slow-start flight size depending on whether this - * is a local network or not. - * - * Extend this so we cache the cwnd too and retrieve it here. - * Make cwnd even bigger than RFC3390 suggests but only if we - * have previous experience with the remote host. Be careful - * not make cwnd bigger than remote receive window or our own - * send socket buffer. Maybe put some additional upper bound - * on the retrieved cwnd. Should do incremental updates to - * hostcache when cwnd collapses so next connection doesn't - * overloads the path again. - * - * RFC3390 says only do this if SYN or SYN/ACK didn't got lost. - * We currently check only in syncache_socket for that. - */ -#define TCP_METRICS_CWND -#ifdef TCP_METRICS_CWND - if (metrics.rmx_cwnd) - tp->snd_cwnd = max(mss, - min(metrics.rmx_cwnd / 2, - min(tp->snd_wnd, so->so_snd.sb_hiwat))); - else -#endif - if (V_tcp_do_rfc3390) - tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380)); -#ifdef INET6 - else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) || - (!isipv6 && in_localaddr(inp->inp_faddr))) -#else - else if (in_localaddr(inp->inp_faddr)) -#endif - tp->snd_cwnd = mss * V_ss_fltsz_local; - else - tp->snd_cwnd = mss * V_ss_fltsz; - /* Check the interface for TSO capabilities. */ if (mtuflags & CSUM_TSO) tp->t_flags |= TF_TSO; @@ -3422,7 +3390,7 @@ * Set snd_cwnd to one segment beyond acknowledged offset. * (tp->snd_una has not yet been updated when this function is called.) */ - tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); + tp->snd_cwnd = tp->t_maxseg + BYTES_ACKED(tp, th); tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); tp->snd_cwnd = ocwnd; @@ -3432,8 +3400,8 @@ * Partial window deflation. Relies on fact that tp->snd_una * not updated yet. 
*/ - if (tp->snd_cwnd > th->th_ack - tp->snd_una) - tp->snd_cwnd -= th->th_ack - tp->snd_una; + if (tp->snd_cwnd > BYTES_ACKED(tp, th)) + tp->snd_cwnd -= BYTES_ACKED(tp, th); else tp->snd_cwnd = 0; tp->snd_cwnd += tp->t_maxseg; Index: netinet/tcp_subr.c =================================================================== --- netinet/tcp_subr.c (.../head/sys) (revision 203910) +++ netinet/tcp_subr.c (.../projects/tcp_cc_head/sys) (revision 203947) @@ -62,6 +64,9 @@ #include #include +#include +#include +#include #include #include #include @@ -80,7 +85,6 @@ #include #endif #include -#include #include #include #include @@ -290,6 +294,7 @@ struct tcpcb_mem { struct tcpcb tcb; struct tcp_timer tt; + struct helper_dblocks hdbs; }; static VNET_DEFINE(uma_zone_t, tcpcb_zone); @@ -374,6 +379,15 @@ V_tcp_inflight_rttthresh = TCPTV_INFLIGHT_RTTTHRESH; + if (register_hhook_head(HHOOK_TYPE_TCP, HHOOK_TCP_ESTABLISHED_IN, + HHOOK_NOWAIT) != 0) + printf("%s: WARNING: unable to register helper hook\n", __func__); + if (register_hhook_head(HHOOK_TYPE_TCP, HHOOK_TCP_ESTABLISHED_OUT, + HHOOK_NOWAIT) != 0) + printf("%s: WARNING: unable to register helper hook\n", __func__); + + cc_init(); + TUNABLE_INT_FETCH("net.inet.tcp.sack.enable", &V_tcp_do_sack); INP_INFO_LOCK_INIT(&V_tcbinfo, "tcp"); @@ -749,6 +763,28 @@ if (tm == NULL) return (NULL); tp = &tm->tcb; + + /* + * Use the current system default CC algorithm. + */ + CC_LIST_RLOCK(); + KASSERT(!STAILQ_EMPTY(&cc_list), ("cc_list is empty!")); + CC_ALGO(tp) = CC_DEFAULT(); + CC_LIST_RUNLOCK(); + + if (CC_ALGO(tp)->cb_init != NULL) + if (CC_ALGO(tp)->cb_init(tp) > 0) { + uma_zfree(V_tcpcb_zone, tm); + return (NULL); + } + + tp->hdbs = &tm->hdbs; + tp->hdbs->class = HELPER_CLASS_TCP; + if (init_helper_dblocks(tp->hdbs)) { + uma_zfree(V_tcpcb_zone, tm); + return (NULL); + } + #ifdef VIMAGE tp->t_vnet = inp->inp_vnet; #endif @@ -912,8 +948,15 @@ } /* Disconnect offload device, if any. 
*/ tcp_offload_detach(tp); - tcp_free_sackholes(tp); + + /* Allow the CC algorithm to clean up after itself. */ + if (CC_ALGO(tp)->cb_destroy != NULL) + CC_ALGO(tp)->cb_destroy(tp); + + destroy_helper_dblocks(tp->hdbs); + + CC_ALGO(tp) = NULL; inp->inp_ppcb = NULL; tp->t_inpcb = NULL; uma_zfree(V_tcpcb_zone, tp); Index: netinet/tcp_timer.c =================================================================== --- netinet/tcp_timer.c (.../head/sys) (revision 203910) +++ netinet/tcp_timer.c (.../projects/tcp_cc_head/sys) (revision 203947) @@ -50,6 +50,7 @@ #include #include +#include #include #include #include @@ -57,7 +58,6 @@ #include #endif #include -#include #include #include #include @@ -117,7 +117,25 @@ /* max idle time in persist */ int tcp_maxidle; +static void inline cc_after_timeout(struct tcpcb *tp); + /* + * CC wrapper hook functions + */ +static void inline +cc_after_timeout(struct tcpcb *tp) +{ + INP_WLOCK_ASSERT(tp->t_inpcb); + + if (CC_ALGO(tp)->after_timeout != NULL) + CC_ALGO(tp)->after_timeout(tp); + + tp->t_dupacks = 0; + EXIT_FASTRECOVERY(tp); + tp->t_bytes_acked = 0; +} + +/* * Tcp protocol timeout routine called every 500 ms. * Updates timestamps used for TCP * causes finite state machine actions if timers expire. @@ -547,40 +565,9 @@ * If timing a segment in this window, stop the timer. */ tp->t_rtttime = 0; - /* - * Close the congestion window down to one segment - * (we'll open it by one segment for each ack we get). - * Since we probably have a window's worth of unacked - * data accumulated, this "slow start" keeps us from - * dumping all that data as back-to-back packets (which - * might overwhelm an intermediate gateway). - * - * There are two phases to the opening: Initially we - * open by one mss on each ack. This makes the window - * size increase exponentially with time. If the - * window is larger than the path can handle, this - * exponential growth results in dropped packet(s) - * almost immediately. 
To get more time between - * drops but still "push" the network to take advantage - * of improving conditions, we switch from exponential - * to linear window opening at some threshhold size. - * For a threshhold, we use half the current window - * size, truncated to a multiple of the mss. - * - * (the minimum cwnd that will give us exponential - * growth is 2 mss. We don't allow the threshhold - * to go below this.) - */ - { - u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; - if (win < 2) - win = 2; - tp->snd_cwnd = tp->t_maxseg; - tp->snd_ssthresh = win * tp->t_maxseg; - tp->t_dupacks = 0; - } - EXIT_FASTRECOVERY(tp); - tp->t_bytes_acked = 0; + + cc_after_timeout(tp); + (void) tcp_output(tp); out: Index: netinet/tcp_var.h =================================================================== --- netinet/tcp_var.h (.../head/sys) (revision 203910) +++ netinet/tcp_var.h (.../projects/tcp_cc_head/sys) (revision 203947) @@ -74,7 +76,7 @@ struct sackhint { struct sackhole *nexthole; int sack_bytes_rexmit; - + tcp_seq last_sack_ack; /* Last sack block acked with current pkt - used for enhanced RTT calculations*/ int ispare; /* explicit pad for 64bit alignment */ uint64_t _pad[2]; /* 1 sacked_bytes, 1 TBD */ }; @@ -199,10 +201,12 @@ struct toe_usrreqs *t_tu; /* offload operations vector */ void *t_toe; /* TOE pcb pointer */ int t_bytes_acked; /* # bytes acked during current RTT */ - int t_ispare; /* explicit pad for 64bit alignment */ void *t_pspare2[6]; /* 2 CC / 4 TBD */ uint64_t _pad[12]; /* 7 UTO, 5 TBD (1-2 CC/RTT?) 
*/ + struct cc_algo *cc_algo; /* the algorithm that will manage congestion control*/ + void *cc_data; /* pointer to a struct containing data required for the cc algorithm in use */ + struct helper_dblocks *hdbs; }; /* @@ -239,7 +243,25 @@ #define ENTER_FASTRECOVERY(tp) tp->t_flags |= TF_FASTRECOVERY #define EXIT_FASTRECOVERY(tp) tp->t_flags &= ~TF_FASTRECOVERY +#define BYTES_ACKED(tp, th) (th->th_ack - tp->snd_una) + /* + * TCP specific helper hook point identifiers. + */ +#define HHOOK_TCP_ESTABLISHED_IN 1 +#define HHOOK_TCP_ESTABLISHED_OUT 2 + +struct tcp_hhook_data { + struct tcpcb *tp; + struct tcphdr *th; + struct tcpopt *to; + long len; + int tso; + tcp_seq curack; + int new_sacked_bytes; +}; + +/* * Flags for the t_oobflags field. */ #define TCPOOB_HAVEDATA 0x01 Index: netinet/tcp_output.c =================================================================== --- netinet/tcp_output.c (.../head/sys) (revision 203910) +++ netinet/tcp_output.c (.../projects/tcp_cc_head/sys) (revision 203947) @@ -53,6 +53,8 @@ #include #include +#include +#include #include #include #include @@ -64,7 +66,6 @@ #include #include #endif -#include #define TCPOUTFLAGS #include #include @@ -108,10 +109,6 @@ CTLFLAG_RW, &VNET_NAME(ss_fltsz_local), 1, "Slow start flight size for local networks"); -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW, - &VNET_NAME(tcp_do_newreno), 0, - "Enable NewReno Algorithms"); - SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW, &VNET_NAME(tcp_do_tso), 0, "Enable TCP Segmentation Offload"); @@ -128,8 +125,21 @@ &VNET_NAME(tcp_autosndbuf_max), 0, "Max size of automatic send buffer"); +static void inline cc_after_idle(struct tcpcb *tp); /* + * CC wrapper hook functions + */ +static void inline +cc_after_idle(struct tcpcb *tp) +{ + INP_WLOCK_ASSERT(tp->t_inpcb); + + if (CC_ALGO(tp)->after_idle != NULL) + CC_ALGO(tp)->after_idle(tp); +} + +/* * Tcp output routine: figure out what should be sent and send it. 
*/ int @@ -152,6 +162,7 @@ struct sackhole *p; int tso = 0; struct tcpopt to; + struct tcp_hhook_data hhook_data; #if 0 int maxburst = TCP_MAXBURST; #endif @@ -171,26 +182,8 @@ * to send, then transmit; otherwise, investigate further. */ idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); - if (idle && ticks - tp->t_rcvtime >= tp->t_rxtcur) { - /* - * We have been idle for "a while" and no acks are - * expected to clock out any data we send -- - * slow start to get ack "clock" running again. - * - * Set the slow-start flight size depending on whether - * this is a local network or not. - */ - int ss = V_ss_fltsz; -#ifdef INET6 - if (isipv6) { - if (in6_localaddr(&tp->t_inpcb->in6p_faddr)) - ss = V_ss_fltsz_local; - } else -#endif /* INET6 */ - if (in_localaddr(tp->t_inpcb->inp_faddr)) - ss = V_ss_fltsz_local; - tp->snd_cwnd = tp->t_maxseg * ss; - } + if (idle && ticks - tp->t_rcvtime >= tp->t_rxtcur) + cc_after_idle(tp); tp->t_flags &= ~TF_LASTIDLE; if (idle) { if (tp->t_flags & TF_MORETOCOME) { @@ -1121,6 +1114,15 @@ tp->snd_max = tp->snd_nxt + len; } + hhook_data.th = th; + hhook_data.tp = tp; + hhook_data.to = &to; + hhook_data.len = len; + hhook_data.tso = tso; + run_hhooks(HHOOK_TYPE_TCP, HHOOK_TCP_ESTABLISHED_OUT, &hhook_data, + tp->hdbs); + + #ifdef TCPDEBUG /* * Trace. Index: netinet/helper.c =================================================================== --- netinet/helper.c (.../head/sys) (revision 0) +++ netinet/helper.c (.../projects/tcp_cc_head/sys) (revision 203947) @@ -0,0 +1,277 @@ +/*- + * Copyright (c) 2010 Lawrence Stewart + * All rights reserved. + * + * This software was developed at the Centre for Advanced Internet + * Architectures, Swinburne University, by Lawrence Stewart, + * made possible in part by a grant from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +static struct rwlock helper_list_lock; +RW_SYSINIT(helperlistlock, &helper_list_lock, "helper list lock"); + +static STAILQ_HEAD(helper_head, helper) helpers = STAILQ_HEAD_INITIALIZER(helpers); + +static int num_dblocks = 0; + +/* Monotonically increasing ID assigned to helpers on registration. */ +static int32_t helper_id = 0; + +static struct helper * get_helper(int32_t id); + +/* + * Public KPI functions. 
+ */
+
+/*
+ * Populate hdbs with one data block for every registered helper that has
+ * HELPER_NEEDS_DBLOCK set, taking a reference on each such helper.
+ *
+ * hdbs->blocks and hdbs->nblocks are always left in a state that is safe to
+ * hand to destroy_helper_dblocks(). Returns 0 on success (including when no
+ * helper needs a data block) or ENOMEM if an allocation fails, in which case
+ * everything set up by this call has been released again.
+ */
+int
+init_helper_dblocks(struct helper_dblocks *hdbs)
+{
+	struct helper *h;
+	struct helper_dblock *dblock;
+	int i = 0, j, error = 0;
+
+	KASSERT(hdbs != NULL, ("struct helper_dblocks not initialised!"));
+
+	/* Start from a known-empty state so every exit path is consistent. */
+	hdbs->blocks = NULL;
+	hdbs->nblocks = 0;
+
+	HELPER_LIST_RLOCK();
+
+	if (num_dblocks == 0) {
+		HELPER_LIST_RUNLOCK();
+		return (0);
+	}
+
+	/* XXXLAS: Should only allocate for helpers of the appropriate class. */
+	hdbs->blocks = malloc(num_dblocks * sizeof(struct helper_dblock),
+	    M_HELPER, M_NOWAIT | M_ZERO);
+
+	if (hdbs->blocks == NULL) {
+		HELPER_LIST_RUNLOCK();
+		return (ENOMEM);
+	}
+
+	STAILQ_FOREACH(h, &helpers, h_next) {
+		if (!(h->h_flags & HELPER_NEEDS_DBLOCK))
+			continue;
+
+		dblock = hdbs->blocks + i;
+		dblock->hd_block = uma_zalloc(h->h_zone, M_NOWAIT);
+		if (dblock->hd_block == NULL) {
+			error = ENOMEM;
+			break;
+		}
+		dblock->hd_id = h->h_id;
+		refcount_acquire(&h->h_refcount);
+		i++;
+	}
+
+	if (error != 0) {
+		/*
+		 * Undo the i blocks set up so far. The list is still read
+		 * locked, so walking it again visits the same helpers in the
+		 * same order as the loop above.
+		 */
+		j = 0;
+		STAILQ_FOREACH(h, &helpers, h_next) {
+			if (j == i)
+				break;
+			if (!(h->h_flags & HELPER_NEEDS_DBLOCK))
+				continue;
+			uma_zfree(h->h_zone, hdbs->blocks[j].hd_block);
+			refcount_release(&h->h_refcount);
+			j++;
+		}
+		free(hdbs->blocks, M_HELPER);
+		hdbs->blocks = NULL;
+	} else
+		hdbs->nblocks = i;
+
+	HELPER_LIST_RUNLOCK();
+	return (error);
+}
+
+/*
+ * Release every data block recorded in hdbs together with the reference held
+ * on its helper, then free the block array itself. Always returns 0.
+ */
+int
+destroy_helper_dblocks(struct helper_dblocks *hdbs)
+{
+	struct helper *h;
+	int32_t nblocks = hdbs->nblocks;
+
+	HELPER_LIST_WLOCK();
+
+	for (nblocks--; nblocks >= 0; nblocks--) {
+		if ((h = get_helper(hdbs->blocks[nblocks].hd_id)) != NULL) {
+			refcount_release(&h->h_refcount);
+			uma_zfree(h->h_zone, hdbs->blocks[nblocks].hd_block);
+		}
+	}
+
+	HELPER_LIST_WUNLOCK();
+	free(hdbs->blocks, M_HELPER);
+	/* Leave hdbs empty so that a repeated destroy is harmless. */
+	hdbs->blocks = NULL;
+	hdbs->nblocks = 0;
+	return (0);
+}
+
+/*
+ * Append h to the list of helpers and assign it the next helper ID.
+ * Always returns 0.
+ */
+int
+register_helper(struct helper *h)
+{
+	HELPER_LIST_WLOCK();
+	/* Only helpers that asked for a data block count towards num_dblocks. */
+	if (h->h_flags & HELPER_NEEDS_DBLOCK)
+		num_dblocks++;
+
+	refcount_init(&h->h_refcount, 0);
+	h->h_id = helper_id++;
+	STAILQ_INSERT_TAIL(&helpers, h, h_next);
+	HELPER_LIST_WUNLOCK();
+	printf("Registered \"%s\" helper (mem %p)\n", h->h_name, h);
+	return (0);
+}
+
+/*
+ * Remove h from the list of helpers. Returns EBUSY, leaving h registered,
+ * while data blocks still hold references on it; returns 0 otherwise.
+ */
+int
+deregister_helper(struct helper *h)
+{
+	int error = 0;
+
+	/*
+	 * TODO (from the original author's notes):
+	 *	HHOOK_WLOCK
+	 *	Remove this helper's hooks
+	 *	HHOOK_WUNLOCK
+	 */
+
+	HELPER_LIST_WLOCK();
+	if (h->h_refcount > 0)
+		error = EBUSY;
+
+	if (!error) {
+		STAILQ_REMOVE(&helpers, h, helper, h_next);
+		/* Mirror the accounting done in register_helper(). */
+		if (h->h_flags & HELPER_NEEDS_DBLOCK)
+			num_dblocks--;
+		printf("Deregistered \"%s\" helper (mem %p)\n", h->h_name, h);
+	}
+	HELPER_LIST_WUNLOCK();
+	return (error);
+}
+
+/*
+ * Look up a registered helper by name. Returns its ID, or -1 if no helper
+ * with that name is registered.
+ */
+int32_t
+get_helper_id(char *hname)
+{
+	struct helper *h;
+	int32_t id = -1;
+
+	HELPER_LIST_RLOCK();
+	STAILQ_FOREACH(h, &helpers, h_next) {
+		if (strncmp(h->h_name, hname, HELPER_NAME_MAXLEN) == 0) {
+			id = h->h_id;
+			break;
+		}
+	}
+	HELPER_LIST_RUNLOCK();
+	return (id);
+}
+
+/*
+ * Return the data block in hdbs that belongs to the helper with the given ID,
+ * or NULL if hdbs holds no block for that helper.
+ */
+void *
+get_helper_dblock(struct helper_dblocks *hdbs, int32_t id)
+{
+	/*
+	 * The index has to be signed: the loop ends by counting down past
+	 * zero, which an unsigned index can never do.
+	 */
+	int32_t nblocks = hdbs->nblocks;
+
+	for (nblocks--; nblocks >= 0; nblocks--) {
+		if (hdbs->blocks[nblocks].hd_id == id)
+			return (hdbs->blocks[nblocks].hd_block);
+	}
+	return (NULL);
+}
+
+/*
+ * Private KPI functions.
+ */
+
+/*
+ * Find the registered helper with the given ID, or return NULL if there is
+ * none. The helper list lock must be held by the caller.
+ */
+static struct helper *
+get_helper(int32_t id)
+{
+	struct helper *h;
+
+	HELPER_LIST_LOCK_ASSERT();
+
+	STAILQ_FOREACH(h, &helpers, h_next) {
+		if (h->h_id == id)
+			return (h);
+	}
+	return (NULL);
+}
+
+/*
+ * Handles kld related events. Returns 0 on success, non-zero on failure.
+ */
+int
+helper_modevent(module_t mod, int event_type, void *data)
+{
+	struct helper_modevent_data *hmd = (struct helper_modevent_data *)data;
+	struct helper *h = hmd->helper;
+	int error = 0;
+
+	switch(event_type) {
+	case MOD_LOAD:
+		/* Helpers that want per-consumer data blocks get a UMA zone. */
+		if (h->h_flags & HELPER_NEEDS_DBLOCK) {
+			if (hmd->uma_zsize <= 0) {
+				printf("Use DECLARE_HELPER_UMA() instead!\n");
+				error = EDOOFUS;
+				break;
+			}
+			h->h_zone = uma_zcreate(hmd->name, hmd->uma_zsize,
+			    hmd->umactor, hmd->umadtor, NULL, NULL, 0, 0);
+			if (h->h_zone == NULL) {
+				error = ENOMEM;
+				break;
+			}
+		}
+		strlcpy(h->h_name, hmd->name, HELPER_NAME_MAXLEN);
+		if (h->mod_init != NULL)
+			error = h->mod_init();
+		if (!error)
+			error = register_helper(h);
+		/* Don't leak the zone when the helper failed to come up. */
+		if (error && (h->h_flags & HELPER_NEEDS_DBLOCK) &&
+		    h->h_zone != NULL) {
+			uma_zdestroy(h->h_zone);
+			h->h_zone = NULL;
+		}
+		break;
+
+	case MOD_QUIESCE:
+		error = deregister_helper(h);
+		if (!error) {
+			/*
+			 * A zone only exists for helpers that set
+			 * HELPER_NEEDS_DBLOCK at load time; the others never
+			 * had h_zone assigned.
+			 */
+			if ((h->h_flags & HELPER_NEEDS_DBLOCK) &&
+			    h->h_zone != NULL) {
+				uma_zdestroy(h->h_zone);
+				h->h_zone = NULL;
+			}
+			if (h->mod_destroy != NULL)
+				h->mod_destroy();
+		} else
+			printf("Helper's refcount != 0, can't unload\n");
+		break;
+
+	case MOD_SHUTDOWN:
+	case MOD_UNLOAD:
+		/* All teardown already happened at MOD_QUIESCE. */
+		break;
+
+	default:
+		error = EINVAL;
+		break;
+	}
+
+	return (error);
+}
Index: netinet/helper.h
===================================================================
--- netinet/helper.h (.../head/sys) (revision 0)
+++ netinet/helper.h (.../projects/tcp_cc_head/sys) (revision 203947)
@@ -0,0 +1,80 @@
+/*-
+ * Copyright (c) 2010 Lawrence Stewart
+ * All rights reserved.
+ *
+ * This software was developed at the Centre for Advanced Internet
+ * Architectures, Swinburne University, by Lawrence Stewart,
+ * made possible in part by a grant from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. 
Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_HELPER_H_
+#define _NETINET_HELPER_H_
+
+
+/*
+ * A single per-helper data block, as filled in by init_helper_dblocks().
+ */
+struct helper_dblock {
+	int32_t hd_id;		/* h_id of the helper this block belongs to */
+	void *hd_block;		/* memory obtained from that helper's h_zone */
+};
+
+/*
+ * The set of data blocks owned by one consumer of the helper framework.
+ */
+struct helper_dblocks {
+	struct helper_dblock *blocks;	/* array of nblocks entries */
+	int32_t nblocks;		/* number of valid entries in blocks */
+	/* NOTE(review): not referenced by helper.c; presumably a HELPER_CLASS_* value - confirm */
+	uint32_t class;
+};
+
+/*
+ * Description of a registered helper. helper_modevent() and
+ * register_helper() fill in h_zone, h_name, h_id and h_refcount.
+ */
+struct helper {
+	int (*mod_init) (void);		/* optional, called on MOD_LOAD */
+	int (*mod_destroy) (void);	/* optional, called on MOD_QUIESCE */
+	uma_zone_t h_zone;		/* data block zone, if HELPER_NEEDS_DBLOCK */
+#define HELPER_NAME_MAXLEN 16
+	char h_name[HELPER_NAME_MAXLEN];	/* copied from the module name */
+	uint16_t h_flags;		/* HELPER_* flags below */
+	uint32_t h_class;		/* HELPER_CLASS_* below; unused by helper.c */
+	int32_t h_id;			/* assigned by register_helper() */
+	volatile uint32_t h_refcount;	/* one per outstanding data block */
+	STAILQ_ENTRY(helper) h_next;	/* linkage on the global helper list */
+};
+
+/* Helper flags */
+#define HELPER_NEEDS_DBLOCK 0x0001	/* allocate a data block per consumer */
+
+/* Helper classes */
+#define HELPER_CLASS_TCP 0x00000001
+
+int init_helper_dblocks(struct helper_dblocks *hdbs);
+int destroy_helper_dblocks(struct helper_dblocks *hdbs);
+int register_helper(struct helper *h);
+int deregister_helper(struct helper *h);
+int32_t get_helper_id(char *hname);
+void * get_helper_dblock(struct helper_dblocks *hdbs, int32_t id);
+
+/*
+ * Wrappers around the rwlock protecting the global helper list.
+ * NOTE(review): helper_list_lock is declared static in helper.c, so these
+ * macros look usable only from that file - confirm before relying on them
+ * elsewhere.
+ */
+#define HELPER_LIST_WLOCK() rw_wlock(&helper_list_lock)
+#define HELPER_LIST_WUNLOCK() rw_wunlock(&helper_list_lock)
+#define HELPER_LIST_RLOCK() rw_rlock(&helper_list_lock)
+#define HELPER_LIST_RUNLOCK() rw_runlock(&helper_list_lock)
+#define HELPER_LIST_LOCK_ASSERT() rw_assert(&helper_list_lock, RA_LOCKED)
+
+#endif /* _NETINET_HELPER_H_ */
Index: netinet/cc_module.h
===================================================================
--- netinet/cc_module.h (.../head/sys) (revision 0)
+++ netinet/cc_module.h (.../projects/tcp_cc_head/sys) (revision 203947)
@@ -0,0 +1,50 @@
+/*-
+ * Copyright (c) 2008-2009
+ *	Swinburne University of Technology, Melbourne, Australia
+ * Copyright (c) 2009 Lawrence Stewart
+ * All rights reserved.
+ *
+ * This software was developed at the Centre for Advanced Internet
+ * Architectures, Swinburne University, by Lawrence Stewart and James Healy,
+ * made possible in part by a grant from the Cisco University Research Program
+ * Fund at Community Foundation Silicon Valley.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_CC_MODULE_H_
+#define _NETINET_CC_MODULE_H_
+
+/*
+ * Declare a kernel module called ccname. Its moduledata_t is initialised
+ * with the stringified name, cc_modevent() and ccalgo, in that order;
+ * cc_newreno.c passes a pointer to its struct cc_algo as ccalgo.
+ */
+#define DECLARE_CC_MODULE(ccname, ccalgo) \
+	static moduledata_t cc_##ccname = { \
+		#ccname, \
+		cc_modevent, \
+		ccalgo \
+	}; \
+	DECLARE_MODULE(ccname, cc_##ccname, SI_SUB_PROTO_IFATTACHDOMAIN, \
+	    SI_ORDER_ANY)
+
+/* Module event handler shared by every module using DECLARE_CC_MODULE(). */
+int cc_modevent(module_t mod, int type, void *data);
+
+#endif /* _NETINET_CC_MODULE_H_ */
Index: netinet/cc_newreno.c
===================================================================
--- netinet/cc_newreno.c (.../head/sys) (revision 0)
+++ netinet/cc_newreno.c (.../projects/tcp_cc_head/sys) (revision 203947)
@@ -0,0 +1,204 @@
+/*-
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
+ *	The Regents of the University of California.
+ * Copyright (c) 2007-2009
+ *	Swinburne University of Technology, Melbourne, Australia
+ * Copyright (c) 2009 Lawrence Stewart
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * NewReno CC functions
+ */
+void newreno_ack_received(struct tcpcb *tp, struct tcphdr *th);
+void newreno_ssthresh_update(struct tcpcb *tp, struct tcphdr *th);
+void newreno_post_fr(struct tcpcb *tp, struct tcphdr *th);
+void newreno_after_idle(struct tcpcb *tp);
+void newreno_after_timeout(struct tcpcb *tp);
+
+/* newreno cc function pointers */
+struct cc_algo newreno_cc_algo = {
+	.name = "newreno",
+	.ack_received = newreno_ack_received,
+	.pre_fr = newreno_ssthresh_update,
+	.post_fr = newreno_post_fr,
+	.after_idle = newreno_after_idle,
+	.after_timeout = newreno_after_timeout
+};
+
+/*
+ * Increase cwnd on receipt of a successful ACK:
+ * if cwnd <= ssthresh, increase by 1 MSS per ACK;
+ * if cwnd > ssthresh, increase by ~1 MSS per RTT.
+ * The argument "th" is unused.
+ */
+void
+newreno_ack_received(struct tcpcb *tp, struct tcphdr *th)
+{
+	u_int cw = tp->snd_cwnd;
+	u_int incr = tp->t_maxseg;
+
+	/*
+	 * If cwnd <= ssthresh, open exponentially (maxseg per packet).
+	 * Otherwise, open linearly (approx. maxseg per RTT
+	 * i.e. maxseg^2 / cwnd per ACK received).
+	 * If cwnd > maxseg^2, fix the cwnd increment at 1 byte
+	 * to avoid capping cwnd (as suggested in RFC2581).
+	 */
+	if (cw > tp->snd_ssthresh)
+		incr = max((incr * incr / cw), 1);
+
+	/* Never grow beyond the largest window the send scale allows. */
+	tp->snd_cwnd = min(cw + incr, TCP_MAXWIN << tp->snd_scale);
+}
+
+/*
+ * Update ssthresh to approx 1/2 of cwnd, rounded down to a whole number of
+ * segments and never less than 2 segments.
+ * The argument "th" is unused but required so that the function can
+ * masquerade as a pre_fr hook function.
+ */
+void
+newreno_ssthresh_update(struct tcpcb *tp, struct tcphdr *th)
+{
+	u_int win;
+
+	/* reset ssthresh */
+	win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;
+
+	if (win < 2)
+		win = 2;
+
+	tp->snd_ssthresh = win * tp->t_maxseg;
+}
+
+/*
+ * Decrease the cwnd in response to packet loss or a transmit timeout.
+ * th can be null, in which case cwnd will be set according to reno instead
+ * of new reno.
+ */
+void
+newreno_post_fr(struct tcpcb *tp, struct tcphdr *th)
+{
+	/*
+	 * Out of fast recovery.
+	 * Window inflation should have left us
+	 * with approximately snd_ssthresh
+	 * outstanding data.
+	 * But in case we would be inclined to
+	 * send a burst, better to do it via
+	 * the slow start mechanism.
+	 */
+	if (th && SEQ_GT(th->th_ack + tp->snd_ssthresh, tp->snd_max))
+		tp->snd_cwnd = tp->snd_max - th->th_ack + tp->t_maxseg;
+	else
+		tp->snd_cwnd = tp->snd_ssthresh;
+}
+
+/*
+ * If a connection has been idle for a while and more data is ready to be
+ * sent, reset cwnd.
+ */
+void
+newreno_after_idle(struct tcpcb *tp)
+{
+	/*
+	 * We have been idle for "a while" and no acks are
+	 * expected to clock out any data we send --
+	 * slow start to get ack "clock" running again.
+	 *
+	 * Set the slow-start flight size depending on whether
+	 * this is a local network or not.
+	 */
+	int ss = V_ss_fltsz;
+
+#ifdef INET6
+	/*
+	 * NOTE(review): isipv6 had no declaration in this function; it is
+	 * derived from the inpcb here the same way other TCP code does it.
+	 * Confirm that INET6 is actually defined for this file (no
+	 * opt_inet6.h include is visible), otherwise this branch is
+	 * compiled out.
+	 */
+	int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
+
+	if (isipv6) {
+		if (in6_localaddr(&tp->t_inpcb->in6p_faddr))
+			ss = V_ss_fltsz_local;
+	} else
+#endif /* INET6 */
+
+	if (in_localaddr(tp->t_inpcb->inp_faddr))
+		ss = V_ss_fltsz_local;
+
+	tp->snd_cwnd = tp->t_maxseg * ss;
+}
+
+/*
+ * Reset the cwnd after a transmission timeout.
+ */
+void
+newreno_after_timeout(struct tcpcb *tp)
+{
+	newreno_ssthresh_update(tp, NULL);
+
+	/*
+	 * Close the congestion window down to one segment
+	 * (we'll open it by one segment for each ack we get).
+	 * Since we probably have a window's worth of unacked
+	 * data accumulated, this "slow start" keeps us from
+	 * dumping all that data as back-to-back packets (which
+	 * might overwhelm an intermediate gateway).
+	 *
+	 * There are two phases to the opening: Initially we
+	 * open by one mss on each ack. This makes the window
+	 * size increase exponentially with time. If the
+	 * window is larger than the path can handle, this
+	 * exponential growth results in dropped packet(s)
+	 * almost immediately. To get more time between
+	 * drops but still "push" the network to take advantage
+	 * of improving conditions, we switch from exponential
+	 * to linear window opening at some threshold size.
+	 * For a threshold, we use half the current window
+	 * size, truncated to a multiple of the mss.
+	 *
+	 * (the minimum cwnd that will give us exponential
+	 * growth is 2 mss. We don't allow the threshold
+	 * to go below this.)
+	 */
+	tp->snd_cwnd = tp->t_maxseg;
+}
+
+DECLARE_CC_MODULE(newreno, &newreno_cc_algo);
Index: netinet/tcp_sack.c
===================================================================
--- netinet/tcp_sack.c (.../head/sys) (revision 203910)
+++ netinet/tcp_sack.c (.../projects/tcp_cc_head/sys) (revision 203947)
@@ -426,6 +426,7 @@
 	 * are received.
 	 */
 	sblkp = &sack_blocks[num_sack_blks - 1];	/* Last SACK block */
+	tp->sackhint.last_sack_ack = sblkp->end;
 	if (SEQ_LT(tp->snd_fack, sblkp->start)) {
 		/*
 		 * The highest SACK block is beyond fack. 
Append new SACK @@ -577,7 +578,7 @@ tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; /* Send one or 2 segments based on how much new data was acked. */ - if (((th->th_ack - tp->snd_una) / tp->t_maxseg) > 2) + if ((BYTES_ACKED(tp, th) / tp->t_maxseg) > 2) num_segs = 2; tp->snd_cwnd = (tp->sackhint.sack_bytes_rexmit + (tp->snd_nxt - tp->sack_newdata) + num_segs * tp->t_maxseg); Index: netinet/hhooks.c =================================================================== --- netinet/hhooks.c (.../head/sys) (revision 0) +++ netinet/hhooks.c (.../projects/tcp_cc_head/sys) (revision 203947) @@ -0,0 +1,301 @@ +/*- + * Copyright (c) 2010 Lawrence Stewart + * All rights reserved. + * + * This software was developed at the Centre for Advanced Internet + * Architectures, Swinburne University, by Lawrence Stewart, + * made possible in part by a grant from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+
+#include
+#include
+
+/* Flags telling get_hhook_head() how to lock the head it returns. */
+#define RLOCK_HHOOK_HEAD 0x01
+#define WLOCK_HHOOK_HEAD 0x02
+
+MALLOC_DECLARE(M_HHOOK);
+MALLOC_DEFINE(M_HHOOK, "helper hook related memory", "Blah");
+
+/* A single hook function registered on a hook point. */
+struct hhook {
+	hhook_func_t h_func;		/* function run by run_hhooks() */
+	void *h_udata;			/* passed back to h_func unchanged */
+	struct helper *h_helper;	/* helper that registered the hook */
+	STAILQ_ENTRY(hhook) h_next;
+};
+
+typedef STAILQ_HEAD(hhook_list, hhook) hhook_list_t;
+
+/* A hook point, identified by the pair (hh_type, hh_id). */
+struct hhook_head {
+	int hh_type;
+	int hh_id;
+	int hh_nhooks;			/* number of entries on hh_hooks */
+	hhook_list_t hh_hooks;
+	struct rmlock hh_lock;		/* protects hh_hooks and hh_nhooks */
+	LIST_ENTRY(hhook_head) hh_next;
+};
+
+LIST_HEAD(hhookheadhead, hhook_head);
+VNET_DEFINE(struct hhookheadhead, hhook_head_list);
+#define V_hhook_head_list VNET(hhook_head_list)
+
+static struct mtx hhook_head_list_lock;
+MTX_SYSINIT(hhookheadlistlock, &hhook_head_list_lock, "hhook_head list lock",
+    MTX_DEF);
+
+static struct hhook_head * get_hhook_head(int hhook_type, int hhook_id,
+    struct rm_priotracker* rmpt, int flags);
+
+
+/*
+ * Public KPI functions
+ */
+
+/*
+ * Create the hook point (hhook_type, hhook_id). HHOOK_WAITOK in flags allows
+ * the allocation to sleep. Returns 0 on success, ENOMEM if the allocation
+ * fails or EEXIST if the hook point is already registered.
+ */
+int
+register_hhook_head(int hhook_type, int hhook_id, int flags)
+{
+	struct hhook_head *hh, *new_hh;
+
+	/*
+	 * Allocate before taking the list mutex: an M_WAITOK allocation may
+	 * sleep, which is not allowed while a mutex is held.
+	 */
+	new_hh = malloc(sizeof(struct hhook_head), M_HHOOK,
+	    M_ZERO | ((flags & HHOOK_WAITOK) ? M_WAITOK : M_NOWAIT));
+
+	if (new_hh == NULL)
+		return (ENOMEM);
+
+	HHOOK_HEAD_LIST_LOCK();
+	hh = get_hhook_head(hhook_type, hhook_id, NULL, 0);
+
+	if (hh != NULL) {
+		/* Every exit path has to drop the list mutex again. */
+		HHOOK_HEAD_LIST_UNLOCK();
+		free(new_hh, M_HHOOK);
+		return (EEXIST);
+	}
+
+	printf("About to register hhook_head %p with type: %d and id: %d\n",
+	    new_hh, hhook_type, hhook_id);
+
+	new_hh->hh_type = hhook_type;
+	new_hh->hh_id = hhook_id;
+	new_hh->hh_nhooks = 0;
+	STAILQ_INIT(&new_hh->hh_hooks);
+	HHOOK_HEAD_LOCK_INIT(new_hh);
+
+	LIST_INSERT_HEAD(&V_hhook_head_list, new_hh, hh_next);
+	HHOOK_HEAD_LIST_UNLOCK();
+	return (0);
+}
+
+/*
+ * Remove the hook point (hhook_type, hhook_id) and free it together with all
+ * hooks still registered on it. Returns 0 on success or ENOENT if no such
+ * hook point exists.
+ */
+int
+deregister_hhook_head(int hhook_type, int hhook_id)
+{
+	struct hhook_head *hh;
+	struct hhook *tmp, *tmp2;
+	int error = 0;
+
+	HHOOK_HEAD_LIST_LOCK();
+	hh = get_hhook_head(hhook_type, hhook_id, NULL, WLOCK_HHOOK_HEAD);
+
+	if (hh == NULL)
+		error = ENOENT;
+	else {
+		LIST_REMOVE(hh, hh_next);
+
+		STAILQ_FOREACH_SAFE(tmp, &hh->hh_hooks, h_next, tmp2) {
+			free(tmp, M_HHOOK);
+		}
+
+		HHOOK_HEAD_WUNLOCK(hh);
+		HHOOK_HEAD_LOCK_DESTROY(hh);
+		free(hh, M_HHOOK);
+	}
+
+	HHOOK_HEAD_LIST_UNLOCK();
+	return (error);
+}
+
+/*
+ * Register hook, with argument udata, on the hook point
+ * (hhook_type, hhook_id) on behalf of helper. Returns 0 on success, ENOMEM if
+ * the allocation fails, ENOENT if the hook point does not exist or EEXIST if
+ * the same (hook, udata) pair is already registered there.
+ *
+ * NOTE(review): unlike the *_hhook_head() functions, the hook point is looked
+ * up here without the hhook_head list mutex held - confirm this cannot race
+ * with deregister_hhook_head().
+ */
+int
+register_hhook(int hhook_type, int hhook_id, struct helper *helper,
+    hhook_func_t hook, void *udata, int flags)
+{
+	struct hhook *h, *tmp;
+	struct hhook_head *hh;
+	int error = 0;
+
+	h = malloc(sizeof(struct hhook), M_HHOOK,
+	    M_ZERO | ((flags & HHOOK_WAITOK) ? M_WAITOK : M_NOWAIT));
+
+	if (h == NULL)
+		return (ENOMEM);
+
+	h->h_helper = helper;
+	h->h_func = hook;
+	h->h_udata = udata;
+
+	hh = get_hhook_head(hhook_type, hhook_id, NULL, WLOCK_HHOOK_HEAD);
+
+	if (hh == NULL) {
+		free(h, M_HHOOK);
+		return (ENOENT);
+	}
+
+	STAILQ_FOREACH(tmp, &hh->hh_hooks, h_next) {
+		if (tmp->h_func == hook && tmp->h_udata == udata) {
+			error = EEXIST;
+			break;
+		}
+	}
+
+	if (!error) {
+		STAILQ_INSERT_TAIL(&hh->hh_hooks, h, h_next);
+		hh->hh_nhooks++;
+	}
+	else
+		free(h, M_HHOOK);
+
+	HHOOK_HEAD_WUNLOCK(hh);
+
+	return (error);
+}
+
+/*
+ * Remove the hook identified by the (hook, udata) pair from the hook point
+ * (hhook_type, hhook_id). Returns ENOENT if the hook point does not exist
+ * and 0 otherwise, whether or not a matching hook was found. The flags
+ * argument is unused.
+ */
+int
+deregister_hhook(int hhook_type, int hhook_id, hhook_func_t hook, void *udata,
+    int flags)
+{
+	struct hhook *tmp;
+	struct hhook_head *hh;
+
+	hh = get_hhook_head(hhook_type, hhook_id, NULL, WLOCK_HHOOK_HEAD);
+
+	if (hh == NULL)
+		return (ENOENT);
+
+	STAILQ_FOREACH(tmp, &hh->hh_hooks, h_next) {
+		if (tmp->h_func == hook && tmp->h_udata == udata) {
+			STAILQ_REMOVE(&hh->hh_hooks, tmp, hhook, h_next);
+			free(tmp, M_HHOOK);
+			hh->hh_nhooks--;
+			/* tmp is gone; stop before the iterator touches it. */
+			break;
+		}
+	}
+
+	HHOOK_HEAD_WUNLOCK(hh);
+	return (0);
+}
+
+/*
+ * Run every hook registered on the hook point (hhook_type, hhook_id),
+ * passing each one its udata, ctx_data, its data block (or NULL) and hdbs.
+ * Does nothing if the hook point does not exist.
+ */
+void
+run_hhooks(int hhook_type, int hhook_id, void *ctx_data,
+    struct helper_dblocks *hdbs)
+{
+	struct hhook_head *hh;
+	struct hhook *tmp;
+	struct rm_priotracker rmpt;
+	int i = 0;
+	void *dblock = NULL;
+	int32_t nblocks = hdbs->nblocks;
+
+	hh = get_hhook_head(hhook_type, hhook_id, &rmpt, RLOCK_HHOOK_HEAD);
+
+	if (hh == NULL)
+		return;
+
+	STAILQ_FOREACH(tmp, &hh->hh_hooks, h_next) {
+		if (tmp->h_helper->h_flags & HELPER_NEEDS_DBLOCK) {
+			/*
+			 * Data blocks are consumed strictly in order; a hook
+			 * whose helper does not own the next block in hdbs is
+			 * skipped rather than run without its data.
+			 */
+			if (nblocks == 0
+			    || i >= nblocks
+			    || tmp->h_helper->h_id != hdbs->blocks[i].hd_id)
+				continue;
+			dblock = hdbs->blocks[i].hd_block;
+			i++;
+		}
+		tmp->h_func(tmp->h_udata, ctx_data, dblock, hdbs);
+		dblock = NULL;
+	}
+
+	HHOOK_HEAD_RUNLOCK(hh, &rmpt);
+}
+
+
+/*
+ * Private KPI functions
+ */
+
+/*
+ * Find the hook point (hhook_type, hhook_id). If found, it is returned read
+ * locked (RLOCK_HHOOK_HEAD, using rmpt as the tracker), write locked
+ * (WLOCK_HHOOK_HEAD) or unlocked (flags == 0). Returns NULL if not found.
+ */
+static struct hhook_head *
+get_hhook_head(int hhook_type, int hhook_id, struct rm_priotracker *rmpt,
+    int flags)
+{
+	struct hhook_head *tmp, *ret = NULL;
+
+	/*KASSERT(HHOOK_HEAD_LIST_LOCK_ASSERT(), ("hhook_head_list_lock not
+	 * locked"));*/
+
+	LIST_FOREACH(tmp, &V_hhook_head_list, hh_next) {
+		if (tmp->hh_type == hhook_type && tmp->hh_id == hhook_id) {
+			ret = tmp;
+			if (flags & RLOCK_HHOOK_HEAD)
+				HHOOK_HEAD_RLOCK(ret, rmpt);
+			else if (flags & WLOCK_HHOOK_HEAD)
+				HHOOK_HEAD_WLOCK(ret);
+			break;
+		}
+	}
+
+	return (ret);
+}
+
+/* Per-vnet initialisation: start with an empty list of hook points. */
+static int
+vnet_hhook_init(const void *unused)
+{
+
+	LIST_INIT(&V_hhook_head_list);
+	return (0);
+}
+
+/* Per-vnet teardown: currently nothing to do. */
+static int
+vnet_hhook_uninit(const void *unused)
+{
+
+	return (0);
+}
+
+#define HHOOK_SYSINIT_ORDER SI_SUB_PROTO_BEGIN
+#define HHOOK_MODEVENT_ORDER (SI_ORDER_FIRST)
+#define HHOOK_VNET_ORDER (HHOOK_MODEVENT_ORDER + 2)
+
+VNET_SYSINIT(vnet_hhook_init, HHOOK_SYSINIT_ORDER, HHOOK_VNET_ORDER,
+    vnet_hhook_init, NULL);
+
+VNET_SYSUNINIT(vnet_hhook_uninit, HHOOK_SYSINIT_ORDER, HHOOK_VNET_ORDER,
+    vnet_hhook_uninit, NULL);
+
Index: netinet/hhooks.h
===================================================================
--- netinet/hhooks.h (.../head/sys) (revision 0)
+++ netinet/hhooks.h (.../projects/tcp_cc_head/sys) (revision 203947)
@@ -0,0 +1,69 @@
+/*-
+ * Copyright (c) 2010 Lawrence Stewart
+ * All rights reserved.
+ *
+ * This software was developed at the Centre for Advanced Internet
+ * Architectures, Swinburne University, by Lawrence Stewart,
+ * made possible in part by a grant from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. 
Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_HHOOKS_H_
+#define _NETINET_HHOOKS_H_
+
+/*
+ * Allocation flags for the register functions. hhooks.c only tests
+ * HHOOK_WAITOK; any call without it allocates with M_NOWAIT.
+ */
+#define HHOOK_WAITOK 0x01
+#define HHOOK_NOWAIT 0x02
+
+/* Hook point types. */
+#define HHOOK_TYPE_TCP 1
+
+struct helper;
+struct helper_dblocks;
+struct hhook_head;
+
+/*
+ * Signature of a hook function. run_hhooks() passes the udata given at
+ * registration, the caller's ctx_data, the helper's data block (NULL when the
+ * helper does not use one) and the caller's full set of data blocks.
+ */
+typedef void (*hhook_func_t)(void *udata, void *ctx_data, void *helper_dblock,
+    struct helper_dblocks *hdbs);
+
+int register_hhook_head(int hhook_type, int hhook_id, int flags);
+int deregister_hhook_head(int hhook_type, int hhook_id);
+int register_hhook(int hhook_type, int hhook_id, struct helper *helper,
+    hhook_func_t hook, void *udata, int flags);
+int deregister_hhook(int hhook_type, int hhook_id, hhook_func_t hook,
+    void *udata, int flags);
+void run_hhooks(int hhook_type, int hhook_id, void *ctx_data,
+    struct helper_dblocks *hdbs);
+
+/*
+ * Wrappers around the mutex protecting the list of hook points.
+ * NOTE(review): hhook_head_list_lock is declared static in hhooks.c, so
+ * these look usable only from that file - confirm.
+ */
+#define HHOOK_HEAD_LIST_LOCK() mtx_lock(&hhook_head_list_lock)
+#define HHOOK_HEAD_LIST_UNLOCK() mtx_unlock(&hhook_head_list_lock)
+#define HHOOK_HEAD_LIST_LOCK_ASSERT() mtx_assert(&hhook_head_list_lock, MA_OWNED)
+
+/* Wrappers around the per hook point rmlock (hh_lock). */
+#define HHOOK_HEAD_LOCK_INIT(hh) rm_init(&(hh)->hh_lock, "hhook_head rm lock")
+#define HHOOK_HEAD_LOCK_DESTROY(hh) rm_destroy(&(hh)->hh_lock)
+#define HHOOK_HEAD_WLOCK(hh) rm_wlock(&(hh)->hh_lock)
+#define HHOOK_HEAD_WUNLOCK(hh) rm_wunlock(&(hh)->hh_lock)
+#define HHOOK_HEAD_RLOCK(hh,rmpt) rm_rlock(&(hh)->hh_lock, (rmpt))
+#define HHOOK_HEAD_RUNLOCK(hh,rmpt) rm_runlock(&(hh)->hh_lock, (rmpt))
+
+#endif /* _NETINET_HHOOKS_H_ */
+
Index: netinet/helper_module.h
===================================================================
--- netinet/helper_module.h (.../head/sys) (revision 0)
+++ netinet/helper_module.h (.../projects/tcp_cc_head/sys) (revision 203947)
@@ -0,0 +1,81 @@
+/*-
+ * Copyright (c) 2010 Lawrence Stewart
+ * All rights reserved.
+ *
+ * This software was developed at the Centre for Advanced Internet
+ * Architectures, Swinburne University, by Lawrence Stewart,
+ * made possible in part by a grant from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_HELPER_MODULE_H_
+#define _NETINET_HELPER_MODULE_H_
+
+/*
+ * Private data handed to helper_modevent() for a helper module.
+ */
+struct helper_modevent_data {
+	char name[HELPER_NAME_MAXLEN];	/* module name, copied into h_name */
+	struct helper *helper;		/* the helper being (un)loaded */
+	int uma_zsize;			/* data block size; must be > 0 if
+					   HELPER_NEEDS_DBLOCK is set */
+	uma_ctor umactor;		/* passed to uma_zcreate() */
+	uma_dtor umadtor;		/* passed to uma_zcreate() */
+};
+
+/*
+ * Declare a helper module that does not use per-consumer data blocks.
+ * uma_zsize is left at zero, so helper_modevent() rejects a helper that sets
+ * HELPER_NEEDS_DBLOCK but was declared with this macro.
+ */
+#define DECLARE_HELPER(hname, hdata, version) \
+	static struct helper_modevent_data hmd_##hname = { \
+		.name = #hname, \
+		.helper = hdata \
+	}; \
+	static moduledata_t h_##hname = { \
+		.name = #hname, \
+		.evhand = helper_modevent, \
+		.priv = &hmd_##hname \
+	}; \
+	DECLARE_MODULE(hname, h_##hname, SI_SUB_PROTO_IFATTACHDOMAIN, \
+	    SI_ORDER_ANY); \
+	MODULE_VERSION(hname, version)
+
+/*
+ * Declare a helper module whose data blocks come from a UMA zone with the
+ * given item size, constructor and destructor.
+ */
+#define DECLARE_HELPER_UMA(hname, hdata, version, size, ctor, dtor) \
+	static struct helper_modevent_data hmd_##hname = { \
+		.name = #hname, \
+		.helper = hdata, \
+		.uma_zsize = size, \
+		.umactor = ctor, \
+		.umadtor = dtor \
+	}; \
+	static moduledata_t h_##hname = { \
+		.name = #hname, \
+		.evhand = helper_modevent, \
+		.priv = &hmd_##hname \
+	}; \
+	DECLARE_MODULE(hname, h_##hname, SI_SUB_PROTO_IFATTACHDOMAIN, \
+	    SI_ORDER_ANY); \
+	MODULE_VERSION(hname, version)
+
+int helper_modevent(module_t mod, int type, void *data);
+
+/*
+ * NOTE(review): MALLOC_DEFINE() in a header instantiates the M_HELPER malloc
+ * type in every file that includes it; it probably belongs in helper.c with
+ * only MALLOC_DECLARE() left here - confirm and move.
+ */
+MALLOC_DECLARE(M_HELPER);
+MALLOC_DEFINE(M_HELPER, "helper data", "Blah");
+
+
+#endif /* _NETINET_HELPER_MODULE_H_ */
Index: netinet/tcp_usrreq.c
===================================================================
--- netinet/tcp_usrreq.c (.../head/sys) (revision 203910)
+++ 
netinet/tcp_usrreq.c (.../projects/tcp_cc_head/sys) (revision 203947) @@ -62,6 +62,7 @@ #include #include +#include #include #include #ifdef INET6 @@ -77,7 +78,6 @@ #include #include #endif -#include #include #include #include @@ -1238,6 +1238,8 @@ struct inpcb *inp; struct tcpcb *tp; struct tcp_info ti; + char buf[TCP_CA_NAME_MAX]; + struct cc_algo *algo; error = 0; inp = sotoinpcb(so); @@ -1347,6 +1349,55 @@ error = EINVAL; break; + case TCP_CONGESTION: + INP_WUNLOCK(inp); + bzero(buf, sizeof(buf)); + error = sooptcopyin(sopt, &buf, sizeof(buf), 1); + if (error) + break; + INP_WLOCK_RECHECK(inp); + /* + * Return EINVAL if we can't find the requested cc algo. + */ + error = EINVAL; + CC_LIST_RLOCK(); + STAILQ_FOREACH(algo, &cc_list, entries) { + if ( strncmp(buf, + algo->name, + TCP_CA_NAME_MAX) == 0) { + /* We've found the requested algo. */ + error = 0; + /* + * We hold a write lock over the tcb + * so it's safe to do these things + * without ordering concerns. + */ + if (CC_ALGO(tp)->cb_destroy != NULL) + CC_ALGO(tp)->cb_destroy(tp); + CC_ALGO(tp) = algo; + /* + * If something goes pear shaped + * initialising the new algo, + * fall back to newreno (which + * does not require initialisation). + */ + if (algo->cb_init != NULL) + if (algo->cb_init(tp) > 0) { + CC_ALGO(tp) = &newreno_cc_algo; + /* + * The only reason init + * should fail is + * because of malloc. + */ + error = ENOMEM; + } + break; /* Break the STAILQ_FOREACH. 
*/ + } + } + CC_LIST_RUNLOCK(); + INP_WUNLOCK(inp); + break; + default: INP_WUNLOCK(inp); error = ENOPROTOOPT; @@ -1390,6 +1441,12 @@ INP_WUNLOCK(inp); error = sooptcopyout(sopt, &ti, sizeof ti); break; + case TCP_CONGESTION: + bzero(buf, sizeof(buf)); + strlcpy(buf, CC_ALGO(tp)->name, TCP_CA_NAME_MAX); + INP_WUNLOCK(inp); + error = sooptcopyout(sopt, buf, TCP_CA_NAME_MAX); + break; default: INP_WUNLOCK(inp); error = ENOPROTOOPT; Index: netinet/cc.c =================================================================== --- netinet/cc.c (.../head/sys) (revision 0) +++ netinet/cc.c (.../projects/tcp_cc_head/sys) (revision 203947) @@ -0,0 +1,312 @@ +/*- + * Copyright (c) 2007-2009 + * Swinburne University of Technology, Melbourne, Australia + * Copyright (c) 2009 Lawrence Stewart + * All rights reserved. + * + * This software was developed at the Centre for Advanced Internet + * Architectures, Swinburne University, by Lawrence Stewart and James Healy, + * made possible in part by a grant from the Cisco University Research Program + * Fund at Community Foundation Silicon Valley. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * List of available cc algorithms on the current system. First element
+ * is used as the system default CC algorithm.
+ */
+struct cc_head cc_list = STAILQ_HEAD_INITIALIZER(cc_list);
+
+/* Protects the cc_list STAILQ. */
+struct rwlock cc_list_lock;
+
+/*
+ * Make new_default the system default CC algorithm by moving it to the head
+ * of the cc_list STAILQ. The cc_list write lock must be held by the caller.
+ */
+static void
+cc_set_default(struct cc_algo *new_default)
+{
+	CC_LIST_WLOCK_ASSERT();
+
+	/* Nothing to do if the requested algo is already the default. */
+	if (new_default == CC_DEFAULT())
+		return;
+
+	STAILQ_REMOVE(&cc_list, new_default, cc_algo, entries);
+	STAILQ_INSERT_HEAD(&cc_list, new_default, entries);
+}
+
+/*
+ * Sysctl handler to show and change the default CC algorithm. Returns 0 on
+ * success, ESRCH for an unknown name, or a sysctl_handle_string() error.
+ */
+static int
+cc_default_algo(SYSCTL_HANDLER_ARGS)
+{
+	char default_cc[TCP_CA_NAME_MAX];
+	struct cc_algo *funcs;
+	int error;
+
+	/* Snapshot the current default so that it can be reported. */
+	CC_LIST_RLOCK();
+	strlcpy(default_cc, CC_DEFAULT()->name, sizeof(default_cc));
+	CC_LIST_RUNLOCK();
+
+	/* Copy in any new name rather than dereferencing req->newptr. */
+	error = sysctl_handle_string(oidp, default_cc, sizeof(default_cc), req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+
+	/* Find algo with specified name and set it to default. */
+	error = ESRCH;
+	CC_LIST_WLOCK();
+	STAILQ_FOREACH(funcs, &cc_list, entries) {
+		if (strncmp(default_cc, funcs->name, TCP_CA_NAME_MAX) == 0) {
+			cc_set_default(funcs);
+			error = 0;
+			break;
+		}
+	}
+	CC_LIST_WUNLOCK();
+
+	return (error);
+}
+
+/*
+ * Sysctl handler to display the list of available CC algorithms as a comma
+ * separated string. Returns 0 on success or an errno value on failure.
+ */
+static int
+cc_list_available(SYSCTL_HANDLER_ARGS)
+{
+	struct cc_algo *algo;
+	struct sbuf *s;
+	int error = 0, first = 1;
+
+	s = sbuf_new(NULL, NULL, TCP_CA_NAME_MAX, SBUF_AUTOEXTEND);
+	if (s == NULL)
+		return (ENOMEM);
+
+	CC_LIST_RLOCK();
+	STAILQ_FOREACH(algo, &cc_list, entries) {
+		/* sbuf_printf() failure is non-zero, not an errno. */
+		if (sbuf_printf(s, first ? "%s" : ", %s", algo->name) != 0) {
+			error = ENOMEM;
+			break;
+		}
+		first = 0;
+	}
+	CC_LIST_RUNLOCK();
+
+	if (error == 0) {
+		sbuf_finish(s);
+		error = sysctl_handle_string(oidp, sbuf_data(s), 1, req);
+	}
+
+	sbuf_delete(s);
+	return (error);
+}
+
+/*
+ * Initialise the CC subsystem (cc_list and its lock) on system boot.
+ */
+void
+cc_init(void)
+{
+	CC_LIST_LOCK_INIT();
+	STAILQ_INIT(&cc_list);
+}
+
+/*
+ * Deregister remove_cc and switch every connection using it to newreno.
+ * Returns 0 on success, EPERM if remove_cc is newreno or is not registered.
+ */
+int
+cc_deregister_algo(struct cc_algo *remove_cc)
+{
+	struct cc_algo *funcs, *tmpfuncs;
+	struct tcpcb *tp = NULL;
+	struct inpcb *inp = NULL;
+	int error = EPERM;
+
+	/* Never allow newreno to be deregistered. */
+	if (&newreno_cc_algo == remove_cc)
+		return (error);
+
+	/* Remove algo from cc_list so that new connections can't use it. */
+	CC_LIST_WLOCK();
+	STAILQ_FOREACH_SAFE(funcs, &cc_list, entries, tmpfuncs) {
+		if (funcs == remove_cc) {
+			/*
+			 * If we're removing the current system default,
+			 * reset the default to newreno.
+			 */
+			if (strncmp(CC_DEFAULT()->name, remove_cc->name,
+			    TCP_CA_NAME_MAX) == 0)
+				cc_set_default(&newreno_cc_algo);
+
+			STAILQ_REMOVE(&cc_list, funcs, cc_algo, entries);
+			error = 0;
+			break;
+		}
+	}
+	CC_LIST_WUNLOCK();
+
+	if (!error) {
+		/*
+		 * Check all active control blocks and change any that are
+		 * using this algorithm back to newreno. If the algorithm that
+		 * was in use requires cleanup code to be run, call it.
+		 *
+		 * New connections already part way through being initialised
+		 * with the CC algo we're removing will not race with this code
+		 * because the INP_INFO_WLOCK is held during initialisation.
+		 * We therefore don't enter the loop below until the connection
+		 * list has stabilised.
+		 */
+		INP_INFO_RLOCK(&V_tcbinfo);
+		LIST_FOREACH(inp, &V_tcb, inp_list) {
+			INP_WLOCK(inp);
+			/* Important to skip tcptw structs. */
+			if (!(inp->inp_flags & INP_TIMEWAIT) &&
+			    (tp = intotcpcb(inp)) != NULL) {
+				/*
+				 * By holding INP_WLOCK here, we are
+				 * assured that the connection is not
+				 * currently executing inside the CC
+				 * module's functions i.e. it is safe to
+				 * make the switch back to newreno.
+				 */
+				if (CC_ALGO(tp) == remove_cc) {
+					tmpfuncs = CC_ALGO(tp);
+					/* Newreno does not require any init. */
+					CC_ALGO(tp) = &newreno_cc_algo;
+					if (tmpfuncs->cb_destroy != NULL)
+						tmpfuncs->cb_destroy(tp);
+				}
+			}
+			INP_WUNLOCK(inp);
+		}
+		INP_INFO_RUNLOCK(&V_tcbinfo);
+	}
+
+	return (error);
+}
+
+/*
+ * Register add_cc by appending it to cc_list. Returns 0 on success, EEXIST
+ * if it, or an algorithm with the same name, is already registered.
+ */
+int
+cc_register_algo(struct cc_algo *add_cc)
+{
+	struct cc_algo *funcs;
+	int error = 0;
+
+	/* Make sure we're not trying to add a duplicate. */
+	CC_LIST_WLOCK();
+	STAILQ_FOREACH(funcs, &cc_list, entries) {
+		if (funcs == add_cc ||
+		    strncmp(funcs->name, add_cc->name, TCP_CA_NAME_MAX) == 0) {
+			error = EEXIST;
+			break;
+		}
+	}
+
+	if (!error)
+		STAILQ_INSERT_TAIL(&cc_list, add_cc, entries);
+
+	CC_LIST_WUNLOCK();
+
+	return (error);
+}
+
+/*
+ * Handles kld related events. Returns 0 on success, non-zero on failure.
+ */
+int
+cc_modevent(module_t mod, int event_type, void *data)
+{
+	struct cc_algo *algo = (struct cc_algo *)data;
+	int error = 0;
+
+	switch (event_type) {
+	case MOD_LOAD:
+		if (algo->mod_init != NULL)
+			error = algo->mod_init();
+		if (!error)
+			error = cc_register_algo(algo);
+		break;
+	case MOD_QUIESCE:
+		error = cc_deregister_algo(algo);
+		if (!error && algo->mod_destroy != NULL)
+			algo->mod_destroy();
+		break;
+	case MOD_SHUTDOWN:
+	case MOD_UNLOAD:
+		break;
+	default:
+		error = EINVAL;
+		break;
+	}
+
+	return (error);
+}
+
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, cc, CTLFLAG_RW, NULL,
+    "congestion control related settings");
+
+SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, algorithm, CTLTYPE_STRING|CTLFLAG_RW,
+    NULL, 0, cc_default_algo, "A",
+    "default congestion control algorithm");
+
+SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, available, CTLTYPE_STRING|CTLFLAG_RD,
+    NULL, 0, cc_list_available, "A",
+    "list available congestion control algorithms");
Index: netinet/cc.h
===================================================================
--- netinet/cc.h	(.../head/sys)	(revision 0)
+++ netinet/cc.h	(.../projects/tcp_cc_head/sys)	(revision 203947)
@@ -0,0 +1,120 @@
+/*-
+ * Copyright (c) 2008-2009
+ *	Swinburne University of Technology, Melbourne, Australia
+ * Copyright (c) 2009 Lawrence Stewart
+ * All rights reserved.
+ *
+ * This software was developed at the Centre for Advanced Internet
+ * Architectures, Swinburne University, by Lawrence Stewart and James Healy,
+ * made possible in part by a grant from the Cisco University Research Program
+ * Fund at Community Foundation Silicon Valley.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_CC_H_
+#define _NETINET_CC_H_
+
+/* Needed for TCP_CA_NAME_MAX define which lives in tcp.h for compat reasons. */
+#include <netinet/tcp.h>
+
+/*
+ * Global CC vars. The first entry of cc_list is the system default.
+ */
+extern STAILQ_HEAD(cc_head, cc_algo) cc_list;
+extern const int tcprexmtthresh;
+extern struct cc_algo newreno_cc_algo;
+
+/*
+ * Declare the new net.inet.tcp.cc sysctl tree.
+ */
+SYSCTL_DECL(_net_inet_tcp_cc);
+
+/*
+ * CC housekeeping functions. Register/deregister return 0 on success.
+ */
+void	cc_init(void);
+int	cc_register_algo(struct cc_algo *add_cc);
+int	cc_deregister_algo(struct cc_algo *remove_cc);
+
+/*
+ * Structure to hold data and function pointers that together represent
+ * a congestion control algorithm.
+ * Based on similar structure in the SCTP stack.
+ */
+struct cc_algo {
+	char	name[TCP_CA_NAME_MAX];
+
+	/* Init global module state on kldload. Non-zero return is an error. */
+	int	(*mod_init) (void);
+
+	/* Cleanup global module state on kldunload. */
+	int	(*mod_destroy) (void);
+
+	/* Init CC state for a new control block. > 0 signals failure. */
+	int	(*cb_init) (struct tcpcb *tp);
+
+	/* Cleanup CC state for a terminating control block. */
+	void	(*cb_destroy) (struct tcpcb *tp);
+
+	/* Init variables for a newly established connection. */
+	void	(*conn_init) (struct tcpcb *tp);
+
+	/* Called on receipt of a regular, valid ack. */
+	void	(*ack_received) (struct tcpcb *tp, struct tcphdr *th);
+
+	/* Called before entering FR. */
+	void	(*pre_fr) (struct tcpcb *tp, struct tcphdr *th);
+
+	/* Called after exiting FR. */
+	void	(*post_fr) (struct tcpcb *tp, struct tcphdr *th);
+
+	/* Called when data transfer resumes after an idle period. */
+	void	(*after_idle) (struct tcpcb *tp);
+
+	/* Called each time the connection's retransmit timer fires. */
+	void	(*after_timeout) (struct tcpcb *tp);
+
+	STAILQ_ENTRY(cc_algo) entries;
+};
+
+/* Macro to obtain the CC algo's struct ptr. */
+#define	CC_ALGO(tp)	((tp)->cc_algo)
+
+/* Macro to obtain the CC algo's data ptr. */
+#define	CC_DATA(tp)	((tp)->cc_data)
+
+/* Macro to obtain the system default CC algo's struct ptr. */
+#define	CC_DEFAULT()	STAILQ_FIRST(&cc_list)
+
+extern struct rwlock cc_list_lock;
+#define	CC_LIST_LOCK_INIT()	rw_init(&cc_list_lock, "cc_list")
+#define	CC_LIST_LOCK_DESTROY()	rw_destroy(&cc_list_lock)
+#define	CC_LIST_RLOCK()		rw_rlock(&cc_list_lock)
+#define	CC_LIST_RUNLOCK()	rw_runlock(&cc_list_lock)
+#define	CC_LIST_WLOCK()		rw_wlock(&cc_list_lock)
+#define	CC_LIST_WUNLOCK()	rw_wunlock(&cc_list_lock)
+#define	CC_LIST_WLOCK_ASSERT()	rw_assert(&cc_list_lock, RA_WLOCKED)
+
+#endif /* _NETINET_CC_H_ */