Index | Thread | Search

From:
Alexander Bluhm <bluhm@openbsd.org>
Subject:
Re: TCP sysctl keep alive unlock
To:
Vitaliy Makkoveev <mvs@openbsd.org>
Cc:
tech@openbsd.org
Date:
Wed, 15 Jan 2025 23:40:26 +0100

Download raw body.

Thread
On Wed, Jan 15, 2025 at 09:13:17PM +0300, Vitaliy Makkoveev wrote:
> On Wed, Jan 15, 2025 at 04:16:12PM +0100, Alexander Bluhm wrote:
> > Hi,
> > 
> > By converting the TCP keepalive timer variables to seconds, sysctl
> > for those variables can be made atomic easily.  Then we have to
> > multiply with TCP_TIME(1) when using them.
> > 
> > An alternative would be to switch the sysctl values from seconds
> > to milliseconds, but that would mean an API break for people with
> > sysctl.conf modifications.
> > 
> > ok?
> 
> I prefer to use timeout_add_sec(9) within TCP_TIMER_ARM() macro. sysctl
> variables have seconds resolution, so you can just convert them to
> ticks and avoid one extra multiplication and one division.

This would require a TCP_TIMER_ARM_SEC() macro.  And we would mix
units for different timers.  I tried this, it did not look nice.

New idea:  We keep copies in seconds for the sysctl and update
timer variables when they change.

While there make the names consistent.  tcp_maxidle was historically
calculated in tcp_slowtimo() as the timers were called from there.
Much easier to calculate maxidle when needed.  tcp_timer_init() looks
pretty useless, just initialize data.

ok?

bluhm

Index: netinet/tcp_input.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_input.c,v
diff -u -p -r1.422 tcp_input.c
--- netinet/tcp_input.c	10 Jan 2025 20:19:03 -0000	1.422
+++ netinet/tcp_input.c	15 Jan 2025 22:06:50 -0000
@@ -109,7 +109,6 @@ int tcp_flush_queue(struct tcpcb *);
 #endif /* INET6 */
 
 const int tcprexmtthresh = 3;
-int tcptv_keep_init = TCPTV_KEEP_INIT;
 
 int tcp_rst_ppslim = 100;		/* 100pps */
 int tcp_rst_ppslim_count = 0;
@@ -870,7 +869,7 @@ findpcb:
 	 */
 	tp->t_rcvtime = now;
 	if (TCPS_HAVEESTABLISHED(tp->t_state))
-		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
+		TCP_TIMER_ARM(tp, TCPT_KEEP, atomic_load_int(&tcp_keepidle));
 
 	if (tp->sack_enable)
 		tcp_del_sackholes(tp, th); /* Delete stale SACK holes */
@@ -1187,7 +1186,8 @@ findpcb:
 			soisconnected(so);
 			tp->t_flags &= ~TF_BLOCKOUTPUT;
 			tp->t_state = TCPS_ESTABLISHED;
-			TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
+			TCP_TIMER_ARM(tp, TCPT_KEEP,
+			    atomic_load_int(&tcp_keepidle));
 			/* Do window scaling on this connection? */
 			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
 				(TF_RCVD_SCALE|TF_REQ_SCALE)) {
@@ -1473,7 +1473,7 @@ trimthenstep6:
 		soisconnected(so);
 		tp->t_flags &= ~TF_BLOCKOUTPUT;
 		tp->t_state = TCPS_ESTABLISHED;
-		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
+		TCP_TIMER_ARM(tp, TCPT_KEEP, atomic_load_int(&tcp_keepidle));
 		/* Do window scaling? */
 		if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
 			(TF_RCVD_SCALE|TF_REQ_SCALE)) {
@@ -1809,10 +1809,14 @@ trimthenstep6:
 				 * we'll hang forever.
 				 */
 				if (so->so_rcv.sb_state & SS_CANTRCVMORE) {
+					int maxidle;
+
 					tp->t_flags |= TF_BLOCKOUTPUT;
 					soisdisconnected(so);
 					tp->t_flags &= ~TF_BLOCKOUTPUT;
-					TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
+					maxidle = TCPTV_KEEPCNT *
+					    atomic_load_int(&tcp_keepidle);
+					TCP_TIMER_ARM(tp, TCPT_2MSL, maxidle);
 				}
 				tp->t_state = TCPS_FIN_WAIT_2;
 			}
@@ -3394,7 +3398,7 @@ syn_cache_timer(void *arg)
 	 * than the keep alive timer would allow, expire it.
 	 */
 	sc->sc_rxttot += sc->sc_rxtcur;
-	if (sc->sc_rxttot >= READ_ONCE(tcptv_keep_init))
+	if (sc->sc_rxttot >= atomic_load_int(&tcp_keepinit))
 		goto dropit;
 
 	/* Advance the timer back-off. */
@@ -3673,7 +3677,7 @@ syn_cache_get(struct sockaddr *src, stru
 	tp->t_sndtime = now;
 	tp->t_rcvacktime = now;
 	tp->t_sndacktime = now;
-	TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init);
+	TCP_TIMER_ARM(tp, TCPT_KEEP, atomic_load_int(&tcp_keepinit));
 	tcpstat_inc(tcps_accepts);
 
 	tcp_mss(tp, sc->sc_peermaxseg);	 /* sets t_maxseg */
Index: netinet/tcp_subr.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_subr.c,v
diff -u -p -r1.204 tcp_subr.c
--- netinet/tcp_subr.c	3 Jan 2025 17:23:51 -0000	1.204
+++ netinet/tcp_subr.c	15 Jan 2025 21:24:52 -0000
@@ -184,9 +184,6 @@ tcp_init(void)
 
 	/* Initialize the compressed state engine. */
 	syn_cache_init();
-
-	/* Initialize timer state. */
-	tcp_timer_init();
 }
 
 /*
Index: netinet/tcp_timer.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_timer.c,v
diff -u -p -r1.81 tcp_timer.c
--- netinet/tcp_timer.c	14 Jan 2025 13:49:44 -0000	1.81
+++ netinet/tcp_timer.c	15 Jan 2025 22:04:22 -0000
@@ -61,16 +61,14 @@
  */
 
 int	tcp_always_keepalive;
-int	tcp_keepidle;
-int	tcp_keepintvl;
-int	tcp_maxpersistidle;	/* max idle time in persist */
-int	tcp_maxidle;		/* [T] max idle time for keep alive */
-
-/*
- * Time to delay the ACK.  This is initialized in tcp_init(), unless
- * its patched.
- */
-int	tcp_delack_msecs;
+int	tcp_keepinit = TCPTV_KEEPINIT;
+int	tcp_keepidle = TCPTV_KEEPIDLE;
+int	tcp_keepintvl = TCPTV_KEEPINTVL;
+int	tcp_keepinit_sec = TCPTV_KEEPINIT / TCP_TIME(1);
+int	tcp_keepidle_sec = TCPTV_KEEPIDLE / TCP_TIME(1);
+int	tcp_keepintvl_sec = TCPTV_KEEPINTVL / TCP_TIME(1);
+int	tcp_maxpersistidle = TCPTV_KEEPIDLE;	/* max idle time in persist */
+int	tcp_delack_msecs = TCP_DELACK_MSECS;	/* time to delay the ACK */
 
 void	tcp_timer_rexmt(void *);
 void	tcp_timer_persist(void *);
@@ -86,26 +84,6 @@ const tcp_timer_func_t tcp_timer_funcs[T
 	tcp_timer_delack,
 };
 
-/*
- * Timer state initialization, called from tcp_init().
- */
-void
-tcp_timer_init(void)
-{
-
-	if (tcp_keepidle == 0)
-		tcp_keepidle = TCPTV_KEEP_IDLE;
-
-	if (tcp_keepintvl == 0)
-		tcp_keepintvl = TCPTV_KEEPINTVL;
-
-	if (tcp_maxpersistidle == 0)
-		tcp_maxpersistidle = TCPTV_KEEP_IDLE;
-
-	if (tcp_delack_msecs == 0)
-		tcp_delack_msecs = TCP_DELACK_MSECS;
-}
-
 static inline int
 tcp_timer_enter(struct inpcb *inp, struct socket **so, struct tcpcb **tp,
     u_int timer)
@@ -176,7 +154,6 @@ void
 tcp_slowtimo(void)
 {
 	mtx_enter(&tcp_timer_mtx);
-	tcp_maxidle = TCPTV_KEEPCNT * tcp_keepintvl;
 	tcp_iss += TCP_ISSINCR2/PR_SLOWHZ;		/* increment iss */
 	mtx_leave(&tcp_timer_mtx);
 }
@@ -488,13 +465,14 @@ tcp_timer_keep(void *arg)
 	if ((atomic_load_int(&tcp_always_keepalive) ||
 	    so->so_options & SO_KEEPALIVE) &&
 	    tp->t_state <= TCPS_CLOSING) {
-		int maxidle;
+		int keepidle, maxidle;
 		uint64_t now;
 
-		maxidle = READ_ONCE(tcp_maxidle);
+		keepidle = atomic_load_int(&tcp_keepidle);
+		maxidle = TCPTV_KEEPCNT * keepidle;
 		now = tcp_now();
 		if ((maxidle > 0) &&
-		    ((now - tp->t_rcvtime) >= tcp_keepidle + maxidle)) {
+		    ((now - tp->t_rcvtime) >= keepidle + maxidle)) {
 			tcpstat_inc(tcps_keepdrops);
 			tp = tcp_drop(tp, ETIMEDOUT);
 			goto out;
@@ -514,9 +492,9 @@ tcp_timer_keep(void *arg)
 		tcpstat_inc(tcps_keepprobe);
 		tcp_respond(tp, mtod(tp->t_template, caddr_t),
 		    NULL, tp->rcv_nxt, tp->snd_una - 1, 0, 0, now);
-		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepintvl);
+		TCP_TIMER_ARM(tp, TCPT_KEEP, atomic_load_int(&tcp_keepintvl));
 	} else
-		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
+		TCP_TIMER_ARM(tp, TCPT_KEEP, atomic_load_int(&tcp_keepidle));
 	if (otp)
 		tcp_trace(TA_TIMER, ostate, tp, otp, NULL, TCPT_KEEP, 0);
  out:
@@ -542,11 +520,11 @@ tcp_timer_2msl(void *arg)
 	}
 	tcp_timer_freesack(tp);
 
-	maxidle = READ_ONCE(tcp_maxidle);
+	maxidle = TCPTV_KEEPCNT * atomic_load_int(&tcp_keepidle);
 	now = tcp_now();
 	if (tp->t_state != TCPS_TIME_WAIT &&
 	    ((maxidle == 0) || ((now - tp->t_rcvtime) <= maxidle)))
-		TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_keepintvl);
+		TCP_TIMER_ARM(tp, TCPT_2MSL, atomic_load_int(&tcp_keepintvl));
 	else
 		tp = tcp_close(tp);
 	if (otp)
Index: netinet/tcp_timer.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_timer.h,v
diff -u -p -r1.25 tcp_timer.h
--- netinet/tcp_timer.h	3 Jan 2025 17:23:51 -0000	1.25
+++ netinet/tcp_timer.h	15 Jan 2025 22:04:57 -0000
@@ -93,8 +93,8 @@
 #define	TCPTV_PERSMIN	TCP_TIME(5)	/* retransmit persistence */
 #define	TCPTV_PERSMAX	TCP_TIME(60)	/* maximum persist interval */
 
-#define	TCPTV_KEEP_INIT	TCP_TIME(75)	/* initial connect keep alive */
-#define	TCPTV_KEEP_IDLE	TCP_TIME(120*60) /* dflt time before probing */
+#define	TCPTV_KEEPINIT	TCP_TIME(75)	/* initial connect keep alive */
+#define	TCPTV_KEEPIDLE	TCP_TIME(120*60) /* dflt time before probing */
 #define	TCPTV_KEEPINTVL	TCP_TIME(75)	/* default probe interval */
 #define	TCPTV_KEEPCNT	8		/* max probes before drop */
 
@@ -154,16 +154,17 @@ typedef void (*tcp_timer_func_t)(void *)
 
 extern const tcp_timer_func_t tcp_timer_funcs[TCPT_NTIMERS];
 
-extern int tcp_delack_msecs;		/* delayed ACK timeout in millisecs */
-extern int tcptv_keep_init;
-extern int tcp_always_keepalive;	/* [a] assume SO_KEEPALIVE always set */
-extern int tcp_keepidle;		/* time before keepalive probes begin */
-extern int tcp_keepintvl;		/* time between keepalive probes */
-extern int tcp_maxidle;			/* time to drop after starting probes */
-extern int tcp_ttl;			/* time to live for TCP segs */
+extern int tcp_delack_msecs;	/* [I] delayed ACK timeout in millisecs */
+extern int tcp_always_keepalive;/* [a] assume SO_KEEPALIVE always set */
+extern int tcp_keepinit;	/* [a] time to keep alive initial SYN packet */
+extern int tcp_keepidle;	/* [a] time before keepalive probes begin */
+extern int tcp_keepintvl;	/* [a] time between keepalive probes */
+extern int tcp_keepinit_sec;	/* [a] copy of above in seconds for sysctl */
+extern int tcp_keepidle_sec;	/* [a] copy of above in seconds for sysctl */
+extern int tcp_keepintvl_sec;	/* [a] copy of above in seconds for sysctl */
+extern int tcp_ttl;		/* time to live for TCP segs */
 extern const int tcp_backoff[];
 
-void	tcp_timer_init(void);
 void	tcp_timer_reaper(void *);
 
 #endif /* _KERNEL */
Index: netinet/tcp_usrreq.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_usrreq.c,v
diff -u -p -r1.239 tcp_usrreq.c
--- netinet/tcp_usrreq.c	9 Jan 2025 16:47:24 -0000	1.239
+++ netinet/tcp_usrreq.c	15 Jan 2025 22:13:17 -0000
@@ -160,6 +160,12 @@ const struct pr_usrreqs tcp6_usrreqs = {
 #endif
 
 const struct sysctl_bounded_args tcpctl_vars[] = {
+	{ TCPCTL_KEEPINITTIME, &tcp_keepinit_sec, 1,
+	    3 * TCPTV_KEEPINIT / TCP_TIME(1) },
+	{ TCPCTL_KEEPIDLE, &tcp_keepidle_sec, 1,
+	    5 * TCPTV_KEEPIDLE / TCP_TIME(1) },
+	{ TCPCTL_KEEPINTVL, &tcp_keepintvl_sec, 1,
+	    3 * TCPTV_KEEPINTVL / TCP_TIME(1) },
 	{ TCPCTL_RFC1323, &tcp_do_rfc1323, 0, 1 },
 	{ TCPCTL_SACK, &tcp_do_sack, 0, 1 },
 	{ TCPCTL_MSSDFLT, &tcp_mssdflt, TCP_MSS, 65535 },
@@ -682,7 +688,7 @@ tcp_connect(struct socket *so, struct mb
 	soisconnecting(so);
 	tcpstat_inc(tcps_connattempt);
 	tp->t_state = TCPS_SYN_SENT;
-	TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init);
+	TCP_TIMER_ARM(tp, TCPT_KEEP, atomic_load_int(&tcp_keepinit));
 	tcp_set_iss_tsm(tp);
 	tcp_sendseqinit(tp);
 	tp->snd_last = tp->snd_una;
@@ -1107,8 +1113,13 @@ tcp_usrclosed(struct tcpcb *tp)
 		 * a full close, we start a timer to make sure sockets are
 		 * not left in FIN_WAIT_2 forever.
 		 */
-		if (tp->t_state == TCPS_FIN_WAIT_2)
-			TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
+		if (tp->t_state == TCPS_FIN_WAIT_2) {
+			int maxidle;
+
+			maxidle = TCPTV_KEEPCNT *
+			    atomic_load_int(&tcp_keepidle);
+			TCP_TIMER_ARM(tp, TCPT_2MSL, maxidle);
+		}
 	}
 	return (tp);
 }
@@ -1409,36 +1420,6 @@ tcp_sysctl(int *name, u_int namelen, voi
 		return (ENOTDIR);
 
 	switch (name[0]) {
-	case TCPCTL_KEEPINITTIME:
-		NET_LOCK();
-		nval = tcptv_keep_init / TCP_TIME(1);
-		error = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &nval,
-		    1, 3 * (TCPTV_KEEP_INIT / TCP_TIME(1)));
-		if (!error)
-			tcptv_keep_init = TCP_TIME(nval);
-		NET_UNLOCK();
-		return (error);
-
-	case TCPCTL_KEEPIDLE:
-		NET_LOCK();
-		nval = tcp_keepidle / TCP_TIME(1);
-		error = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &nval,
-		    1, 5 * (TCPTV_KEEP_IDLE / TCP_TIME(1)));
-		if (!error)
-			tcp_keepidle = TCP_TIME(nval);
-		NET_UNLOCK();
-		return (error);
-
-	case TCPCTL_KEEPINTVL:
-		NET_LOCK();
-		nval = tcp_keepintvl / TCP_TIME(1);
-		error = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &nval,
-		    1, 3 * (TCPTV_KEEPINTVL / TCP_TIME(1)));
-		if (!error)
-			tcp_keepintvl = TCP_TIME(nval);
-		NET_UNLOCK();
-		return (error);
-
 	case TCPCTL_BADDYNAMIC:
 		NET_LOCK();
 		error = sysctl_struct(oldp, oldlenp, newp, newlen,
@@ -1528,8 +1509,23 @@ tcp_sysctl(int *name, u_int namelen, voi
 		return (error);
 
 	default:
-		return sysctl_bounded_arr(tcpctl_vars, nitems(tcpctl_vars),
+		error = sysctl_bounded_arr(tcpctl_vars, nitems(tcpctl_vars),
 		    name, namelen, oldp, oldlenp, newp, newlen);
+		switch (name[0]) {
+		case TCPCTL_KEEPINITTIME:
+			atomic_store_int(&tcp_keepinit,
+			    atomic_load_int(&tcp_keepinit_sec) * TCP_TIME(1));
+			break;
+		case TCPCTL_KEEPIDLE:
+			atomic_store_int(&tcp_keepidle,
+			    atomic_load_int(&tcp_keepidle_sec) * TCP_TIME(1));
+			break;
+		case TCPCTL_KEEPINTVL:
+			atomic_store_int(&tcp_keepintvl,
+			    atomic_load_int(&tcp_keepintvl_sec) * TCP_TIME(1));
+			break;
+		}
+		return (error);
 	}
 	/* NOTREACHED */
 }
Index: netinet/tcp_var.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_var.h,v
diff -u -p -r1.184 tcp_var.h
--- netinet/tcp_var.h	5 Jan 2025 12:18:48 -0000	1.184
+++ netinet/tcp_var.h	15 Jan 2025 20:22:23 -0000
@@ -679,7 +679,6 @@ extern	struct pool tcpcb_pool;
 extern	struct inpcbtable tcbtable, tcb6table;	/* queue of active tcpcb's */
 extern	int tcp_do_rfc1323;	/* [a] enabled/disabled? */
 extern	const int tcprexmtthresh;
-extern	int tcptv_keep_init;	/* [N] time to keep alive initial SYN packet */
 extern	int tcp_mssdflt;	/* [a] default maximum segment size */
 extern	int tcp_rst_ppslim;	/* [a] maximum outgoing RST packet per second */
 extern	int tcp_ack_on_push;	/* [a] ACK immediately on PUSH */