From: Vitaliy Makkoveev Subject: Re: TCP sysctl keep alive unlock To: Alexander Bluhm Cc: tech@openbsd.org Date: Thu, 16 Jan 2025 13:26:39 +0300 On Wed, Jan 15, 2025 at 11:40:26PM +0100, Alexander Bluhm wrote: > On Wed, Jan 15, 2025 at 09:13:17PM +0300, Vitaliy Makkoveev wrote: > > On Wed, Jan 15, 2025 at 04:16:12PM +0100, Alexander Bluhm wrote: > > > Hi, > > > > > > By converting the TCP keepalive timer variables to seconds, sysctl > > > for those variables can be made atomic easily. Then we have to > > > multiply with TCP_TIME(1) when using them. > > > > > > An alternative would be to switch the sysctl values from seconds > > > to milliseconds, but that would mean an API break for people with > > > sysctl.conf modifications. > > > > > > ok? > > > > I prefer to use timeout_add_sec(9) within TCP_TIMER_ARM() macro. sysctl > > variables have seconds resolution, so you can just convert them to > > ticks and avoid one extra multiplication and one division. > > This would require a TCP_TIMER_ARM_SEC() macro. And we would mix > units for different timers. I tried this, it did not look nice. > > New idea: We keep copies in seconds for the sysctl and update > timer variables when they change. > > While there make the names consistent. tcp_maxidle was historically > calculated in tcp_slowtimo() as the timers were called from there. > Much easier to calculate maxidle when needed. tcp_timer_init() looks > pretty useless, just initialize data. > > ok? > Much better. ok mvs > bluhm > > Index: netinet/tcp_input.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_input.c,v > diff -u -p -r1.422 tcp_input.c > --- netinet/tcp_input.c 10 Jan 2025 20:19:03 -0000 1.422 > +++ netinet/tcp_input.c 15 Jan 2025 22:06:50 -0000 > @@ -109,7 +109,6 @@ int tcp_flush_queue(struct tcpcb *); > #endif /* INET6 */ > > const int tcprexmtthresh = 3; > -int tcptv_keep_init = TCPTV_KEEP_INIT; > > int tcp_rst_ppslim = 100; /* 100pps */ > int tcp_rst_ppslim_count = 0; > @@ -870,7 +869,7 @@ findpcb: > */ > tp->t_rcvtime = now; > if (TCPS_HAVEESTABLISHED(tp->t_state)) > - TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle); > + TCP_TIMER_ARM(tp, TCPT_KEEP, atomic_load_int(&tcp_keepidle)); > > if (tp->sack_enable) > tcp_del_sackholes(tp, th); /* Delete stale SACK holes */ > @@ -1187,7 +1186,8 @@ findpcb: > soisconnected(so); > tp->t_flags &= ~TF_BLOCKOUTPUT; > tp->t_state = TCPS_ESTABLISHED; > - TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle); > + TCP_TIMER_ARM(tp, TCPT_KEEP, > + atomic_load_int(&tcp_keepidle)); > /* Do window scaling on this connection? */ > if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == > (TF_RCVD_SCALE|TF_REQ_SCALE)) { > @@ -1473,7 +1473,7 @@ trimthenstep6: > soisconnected(so); > tp->t_flags &= ~TF_BLOCKOUTPUT; > tp->t_state = TCPS_ESTABLISHED; > - TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle); > + TCP_TIMER_ARM(tp, TCPT_KEEP, atomic_load_int(&tcp_keepidle)); > /* Do window scaling? */ > if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == > (TF_RCVD_SCALE|TF_REQ_SCALE)) { > @@ -1809,10 +1809,14 @@ trimthenstep6: > * we'll hang forever. > */ > if (so->so_rcv.sb_state & SS_CANTRCVMORE) { > + int maxidle; > + > tp->t_flags |= TF_BLOCKOUTPUT; > soisdisconnected(so); > tp->t_flags &= ~TF_BLOCKOUTPUT; > - TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle); > + maxidle = TCPTV_KEEPCNT * > + atomic_load_int(&tcp_keepidle); > + TCP_TIMER_ARM(tp, TCPT_2MSL, maxidle); > } > tp->t_state = TCPS_FIN_WAIT_2; > } > @@ -3394,7 +3398,7 @@ syn_cache_timer(void *arg) > * than the keep alive timer would allow, expire it. > */ > sc->sc_rxttot += sc->sc_rxtcur; > - if (sc->sc_rxttot >= READ_ONCE(tcptv_keep_init)) > + if (sc->sc_rxttot >= atomic_load_int(&tcp_keepinit)) > goto dropit; > > /* Advance the timer back-off. */ > @@ -3673,7 +3677,7 @@ syn_cache_get(struct sockaddr *src, stru > tp->t_sndtime = now; > tp->t_rcvacktime = now; > tp->t_sndacktime = now; > - TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init); > + TCP_TIMER_ARM(tp, TCPT_KEEP, atomic_load_int(&tcp_keepinit)); > tcpstat_inc(tcps_accepts); > > tcp_mss(tp, sc->sc_peermaxseg); /* sets t_maxseg */ > Index: netinet/tcp_subr.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_subr.c,v > diff -u -p -r1.204 tcp_subr.c > --- netinet/tcp_subr.c 3 Jan 2025 17:23:51 -0000 1.204 > +++ netinet/tcp_subr.c 15 Jan 2025 21:24:52 -0000 > @@ -184,9 +184,6 @@ tcp_init(void) > > /* Initialize the compressed state engine. */ > syn_cache_init(); > - > - /* Initialize timer state. */ > - tcp_timer_init(); > } > > /* > Index: netinet/tcp_timer.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_timer.c,v > diff -u -p -r1.81 tcp_timer.c > --- netinet/tcp_timer.c 14 Jan 2025 13:49:44 -0000 1.81 > +++ netinet/tcp_timer.c 15 Jan 2025 22:04:22 -0000 > @@ -61,16 +61,14 @@ > */ > > int tcp_always_keepalive; > -int tcp_keepidle; > -int tcp_keepintvl; > -int tcp_maxpersistidle; /* max idle time in persist */ > -int tcp_maxidle; /* [T] max idle time for keep alive */ > - > -/* > - * Time to delay the ACK. This is initialized in tcp_init(), unless > - * its patched. > - */ > -int tcp_delack_msecs; > +int tcp_keepinit = TCPTV_KEEPINIT; > +int tcp_keepidle = TCPTV_KEEPIDLE; > +int tcp_keepintvl = TCPTV_KEEPINTVL; > +int tcp_keepinit_sec = TCPTV_KEEPINIT / TCP_TIME(1); > +int tcp_keepidle_sec = TCPTV_KEEPIDLE / TCP_TIME(1); > +int tcp_keepintvl_sec = TCPTV_KEEPINTVL / TCP_TIME(1); > +int tcp_maxpersistidle = TCPTV_KEEPIDLE; /* max idle time in persist */ > +int tcp_delack_msecs = TCP_DELACK_MSECS; /* time to delay the ACK */ > > void tcp_timer_rexmt(void *); > void tcp_timer_persist(void *); > @@ -86,26 +84,6 @@ const tcp_timer_func_t tcp_timer_funcs[T > tcp_timer_delack, > }; > > -/* > - * Timer state initialization, called from tcp_init(). > - */ > -void > -tcp_timer_init(void) > -{ > - > - if (tcp_keepidle == 0) > - tcp_keepidle = TCPTV_KEEP_IDLE; > - > - if (tcp_keepintvl == 0) > - tcp_keepintvl = TCPTV_KEEPINTVL; > - > - if (tcp_maxpersistidle == 0) > - tcp_maxpersistidle = TCPTV_KEEP_IDLE; > - > - if (tcp_delack_msecs == 0) > - tcp_delack_msecs = TCP_DELACK_MSECS; > -} > - > static inline int > tcp_timer_enter(struct inpcb *inp, struct socket **so, struct tcpcb **tp, > u_int timer) > @@ -176,7 +154,6 @@ void > tcp_slowtimo(void) > { > mtx_enter(&tcp_timer_mtx); > - tcp_maxidle = TCPTV_KEEPCNT * tcp_keepintvl; > tcp_iss += TCP_ISSINCR2/PR_SLOWHZ; /* increment iss */ > mtx_leave(&tcp_timer_mtx); > } > @@ -488,13 +465,14 @@ tcp_timer_keep(void *arg) > if ((atomic_load_int(&tcp_always_keepalive) || > so->so_options & SO_KEEPALIVE) && > tp->t_state <= TCPS_CLOSING) { > - int maxidle; > + int keepidle, maxidle; > uint64_t now; > > - maxidle = READ_ONCE(tcp_maxidle); > + keepidle = atomic_load_int(&tcp_keepidle); > + maxidle = TCPTV_KEEPCNT * keepidle; > now = tcp_now(); > if ((maxidle > 0) && > - ((now - tp->t_rcvtime) >= tcp_keepidle + maxidle)) { > + ((now - tp->t_rcvtime) >= keepidle + maxidle)) { > tcpstat_inc(tcps_keepdrops); > tp = tcp_drop(tp, ETIMEDOUT); > goto out; > @@ -514,9 +492,9 @@ tcp_timer_keep(void *arg) > tcpstat_inc(tcps_keepprobe); > tcp_respond(tp, mtod(tp->t_template, caddr_t), > NULL, tp->rcv_nxt, tp->snd_una - 1, 0, 0, now); > - TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepintvl); > + TCP_TIMER_ARM(tp, TCPT_KEEP, atomic_load_int(&tcp_keepintvl)); > } else > - TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle); > + TCP_TIMER_ARM(tp, TCPT_KEEP, atomic_load_int(&tcp_keepidle)); > if (otp) > tcp_trace(TA_TIMER, ostate, tp, otp, NULL, TCPT_KEEP, 0); > out: > @@ -542,11 +520,11 @@ tcp_timer_2msl(void *arg) > } > tcp_timer_freesack(tp); > > - maxidle = READ_ONCE(tcp_maxidle); > + maxidle = TCPTV_KEEPCNT * atomic_load_int(&tcp_keepidle); > now = tcp_now(); > if (tp->t_state != TCPS_TIME_WAIT && > ((maxidle == 0) || ((now - tp->t_rcvtime) <= maxidle))) > - TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_keepintvl); > + TCP_TIMER_ARM(tp, TCPT_2MSL, atomic_load_int(&tcp_keepintvl)); > else > tp = tcp_close(tp); > if (otp) > Index: netinet/tcp_timer.h > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_timer.h,v > diff -u -p -r1.25 tcp_timer.h > --- netinet/tcp_timer.h 3 Jan 2025 17:23:51 -0000 1.25 > +++ netinet/tcp_timer.h 15 Jan 2025 22:04:57 -0000 > @@ -93,8 +93,8 @@ > #define TCPTV_PERSMIN TCP_TIME(5) /* retransmit persistence */ > #define TCPTV_PERSMAX TCP_TIME(60) /* maximum persist interval */ > > -#define TCPTV_KEEP_INIT TCP_TIME(75) /* initial connect keep alive */ > -#define TCPTV_KEEP_IDLE TCP_TIME(120*60) /* dflt time before probing */ > +#define TCPTV_KEEPINIT TCP_TIME(75) /* initial connect keep alive */ > +#define TCPTV_KEEPIDLE TCP_TIME(120*60) /* dflt time before probing */ > #define TCPTV_KEEPINTVL TCP_TIME(75) /* default probe interval */ > #define TCPTV_KEEPCNT 8 /* max probes before drop */ > > @@ -154,16 +154,17 @@ typedef void (*tcp_timer_func_t)(void *) > > extern const tcp_timer_func_t tcp_timer_funcs[TCPT_NTIMERS]; > > -extern int tcp_delack_msecs; /* delayed ACK timeout in millisecs */ > -extern int tcptv_keep_init; > -extern int tcp_always_keepalive; /* [a] assume SO_KEEPALIVE always set */ > -extern int tcp_keepidle; /* time before keepalive probes begin */ > -extern int tcp_keepintvl; /* time between keepalive probes */ > -extern int tcp_maxidle; /* time to drop after starting probes */ > -extern int tcp_ttl; /* time to live for TCP segs */ > +extern int tcp_delack_msecs; /* [I] delayed ACK timeout in millisecs */ > +extern int tcp_always_keepalive;/* [a] assume SO_KEEPALIVE always set */ > +extern int tcp_keepinit; /* [a] time to keep alive initial SYN packet */ > +extern int tcp_keepidle; /* [a] time before keepalive probes begin */ > +extern int tcp_keepintvl; /* [a] time between keepalive probes */ > +extern int tcp_keepinit_sec; /* [a] copy of above in seconds for sysctl */ > +extern int tcp_keepidle_sec; /* [a] copy of above in seconds for sysctl */ > +extern int tcp_keepintvl_sec; /* [a] copy of above in seconds for sysctl */ > +extern int tcp_ttl; /* time to live for TCP segs */ > extern const int tcp_backoff[]; > > -void tcp_timer_init(void); > void tcp_timer_reaper(void *); > > #endif /* _KERNEL */ > Index: netinet/tcp_usrreq.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_usrreq.c,v > diff -u -p -r1.239 tcp_usrreq.c > --- netinet/tcp_usrreq.c 9 Jan 2025 16:47:24 -0000 1.239 > +++ netinet/tcp_usrreq.c 15 Jan 2025 22:13:17 -0000 > @@ -160,6 +160,12 @@ const struct pr_usrreqs tcp6_usrreqs = { > #endif > > const struct sysctl_bounded_args tcpctl_vars[] = { > + { TCPCTL_KEEPINITTIME, &tcp_keepinit_sec, 1, > + 3 * TCPTV_KEEPINIT / TCP_TIME(1) }, > + { TCPCTL_KEEPIDLE, &tcp_keepidle_sec, 1, > + 5 * TCPTV_KEEPIDLE / TCP_TIME(1) }, > + { TCPCTL_KEEPINTVL, &tcp_keepintvl_sec, 1, > + 3 * TCPTV_KEEPINTVL / TCP_TIME(1) }, > { TCPCTL_RFC1323, &tcp_do_rfc1323, 0, 1 }, > { TCPCTL_SACK, &tcp_do_sack, 0, 1 }, > { TCPCTL_MSSDFLT, &tcp_mssdflt, TCP_MSS, 65535 }, > @@ -682,7 +688,7 @@ tcp_connect(struct socket *so, struct mb > soisconnecting(so); > tcpstat_inc(tcps_connattempt); > tp->t_state = TCPS_SYN_SENT; > - TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init); > + TCP_TIMER_ARM(tp, TCPT_KEEP, atomic_load_int(&tcp_keepinit)); > tcp_set_iss_tsm(tp); > tcp_sendseqinit(tp); > tp->snd_last = tp->snd_una; > @@ -1107,8 +1113,13 @@ tcp_usrclosed(struct tcpcb *tp) > * a full close, we start a timer to make sure sockets are > * not left in FIN_WAIT_2 forever. > */ > - if (tp->t_state == TCPS_FIN_WAIT_2) > - TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle); > + if (tp->t_state == TCPS_FIN_WAIT_2) { > + int maxidle; > + > + maxidle = TCPTV_KEEPCNT * > + atomic_load_int(&tcp_keepidle); > + TCP_TIMER_ARM(tp, TCPT_2MSL, maxidle); > + } > } > return (tp); > } > @@ -1409,36 +1420,6 @@ tcp_sysctl(int *name, u_int namelen, voi > return (ENOTDIR); > > switch (name[0]) { > - case TCPCTL_KEEPINITTIME: > - NET_LOCK(); > - nval = tcptv_keep_init / TCP_TIME(1); > - error = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &nval, > - 1, 3 * (TCPTV_KEEP_INIT / TCP_TIME(1))); > - if (!error) > - tcptv_keep_init = TCP_TIME(nval); > - NET_UNLOCK(); > - return (error); > - > - case TCPCTL_KEEPIDLE: > - NET_LOCK(); > - nval = tcp_keepidle / TCP_TIME(1); > - error = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &nval, > - 1, 5 * (TCPTV_KEEP_IDLE / TCP_TIME(1))); > - if (!error) > - tcp_keepidle = TCP_TIME(nval); > - NET_UNLOCK(); > - return (error); > - > - case TCPCTL_KEEPINTVL: > - NET_LOCK(); > - nval = tcp_keepintvl / TCP_TIME(1); > - error = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &nval, > - 1, 3 * (TCPTV_KEEPINTVL / TCP_TIME(1))); > - if (!error) > - tcp_keepintvl = TCP_TIME(nval); > - NET_UNLOCK(); > - return (error); > - > case TCPCTL_BADDYNAMIC: > NET_LOCK(); > error = sysctl_struct(oldp, oldlenp, newp, newlen, > @@ -1528,8 +1509,23 @@ tcp_sysctl(int *name, u_int namelen, voi > return (error); > > default: > - return sysctl_bounded_arr(tcpctl_vars, nitems(tcpctl_vars), > + error = sysctl_bounded_arr(tcpctl_vars, nitems(tcpctl_vars), > name, namelen, oldp, oldlenp, newp, newlen); > + switch (name[0]) { > + case TCPCTL_KEEPINITTIME: > + atomic_store_int(&tcp_keepinit, > + atomic_load_int(&tcp_keepinit_sec) * TCP_TIME(1)); > + break; > + case TCPCTL_KEEPIDLE: > + atomic_store_int(&tcp_keepidle, > + atomic_load_int(&tcp_keepidle_sec) * TCP_TIME(1)); > + break; > + case TCPCTL_KEEPINTVL: > + atomic_store_int(&tcp_keepintvl, > + atomic_load_int(&tcp_keepintvl_sec) * TCP_TIME(1)); > + break; > + } > + return (error); > } > /* NOTREACHED */ > } > Index: netinet/tcp_var.h > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_var.h,v > diff -u -p -r1.184 tcp_var.h > --- netinet/tcp_var.h 5 Jan 2025 12:18:48 -0000 1.184 > +++ netinet/tcp_var.h 15 Jan 2025 20:22:23 -0000 > @@ -679,7 +679,6 @@ extern struct pool tcpcb_pool; > extern struct inpcbtable tcbtable, tcb6table; /* queue of active tcpcb's */ > extern int tcp_do_rfc1323; /* [a] enabled/disabled? */ > extern const int tcprexmtthresh; > -extern int tcptv_keep_init; /* [N] time to keep alive initial SYN packet */ > extern int tcp_mssdflt; /* [a] default maximum segment size */ > extern int tcp_rst_ppslim; /* [a] maximum outgoing RST packet per second */ > extern int tcp_ack_on_push; /* [a] ACK immediately on PUSH */ >