Index | Thread | Search

From:
Alexander Bluhm <bluhm@openbsd.org>
Subject:
Re: tcp output MP fixes
To:
Vitaliy Makkoveev <mvs@openbsd.org>
Cc:
tech@openbsd.org
Date:
Wed, 6 Nov 2024 13:25:54 +0100

Download raw body.

Thread
On Wed, Nov 06, 2024 at 02:40:16PM +0300, Vitaliy Makkoveev wrote:
> On Wed, Nov 06, 2024 at 11:50:03AM +0100, Alexander Bluhm wrote:
> > Hi,
> > 
> > Next goal is running TCP socket layer with shared netlock.  I went
> > through tcp_output() and looked for MP races.  Mostly documentation
> > and small fixes.
> > 
> > I am not sure about sbchecklowmem().  For MP safety we must serialize
> > calls to m_pool_used() and protect sblowmem with a mutex.  In
> > practice not much can ever go wrong.  I have chosen mtx_enter_try()
> > to update without spinning and using a possibly obsololete value
> > in case of lock contention.
> > 
> > ok?
> > 
> 
> Could you keep locking description with variables definition? We could
> have many places where variable declared as extern, so each of them
> should have copy-pasted comment. Imagine you need to keep them in sync.

In an ideal world, global variables one extern declaration in a .h
file, and a definitiion in one .c file.  Or variable is only used
in one .c file.

I would prefer that every global variable is in one .h file.
Declaring extern here and there makes consistency difficult.

In our code base, we have a mixture of everything.  Something is
commented in .h, others in .c, or duplicate comments in .c and .h
or no comments at all.

I saw that ip_input.c have no comments at all, and ip_var.h contained
comments, so I tried to move the locking comments there.  Also the
.h file is the public API description, which is an argument to put
the comments there.

Do you think we should move all [IaN] comments to the .c file?  Even
if everything else is in the .h file?  Struct fields are usually
in the .h file only.

My approach would be to look at existing comments, clean up a bit,
and add where it matches.  And my impression was that ip_var.h
matches better.  But I am fine with other solutions.

bluhm

> > Index: kern/uipc_socket2.c
> > ===================================================================
> > RCS file: /data/mirror/openbsd/cvs/src/sys/kern/uipc_socket2.c,v
> > diff -u -p -r1.158 uipc_socket2.c
> > --- kern/uipc_socket2.c	12 Jul 2024 19:50:35 -0000	1.158
> > +++ kern/uipc_socket2.c	6 Nov 2024 10:42:27 -0000
> > @@ -49,6 +49,9 @@
> >  
> >  u_long	sb_max = SB_MAX;		/* patchable */
> >  
> > +struct mutex sblowmem_mtx = MUTEX_INITIALIZER(IPL_SOFTNET);
> > +int sblowmem;
> > +
> >  extern struct pool mclpools[];
> >  extern struct pool mbpool;
> >  
> > @@ -683,15 +674,24 @@ sbcheckreserve(u_long cnt, u_long defcnt
> >  int
> >  sbchecklowmem(void)
> >  {
> > -	static int sblowmem;
> > -	unsigned int used = m_pool_used();
> > +	int lowmem;
> > +	unsigned int used;
> >  
> > +	if (mtx_enter_try(&sblowmem_mtx) == 0) {
> > +		/* another thread is updating sblowmem, use old or new value */
> > +		return (sblowmem);
> > +	}
> > +
> > +	used = m_pool_used();
> >  	if (used < 60)
> >  		sblowmem = 0;
> >  	else if (used > 80)
> >  		sblowmem = 1;
> > +	lowmem = sblowmem;
> > +
> > +	mtx_leave(&sblowmem_mtx);
> >  
> > -	return (sblowmem);
> > +	return (lowmem);
> >  }
> >  
> >  /*
> > Index: netinet/in_pcb.h
> > ===================================================================
> > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.h,v
> > diff -u -p -r1.159 in_pcb.h
> > --- netinet/in_pcb.h	5 Nov 2024 10:49:23 -0000	1.159
> > +++ netinet/in_pcb.h	5 Nov 2024 23:18:13 -0000
> > @@ -140,7 +140,7 @@ struct inpcb {
> >  	u_int16_t inp_lport;		/* [t] local port */
> >  	struct	  socket *inp_socket;	/* [I] back pointer to socket */
> >  	caddr_t	  inp_ppcb;		/* pointer to per-protocol pcb */
> > -	struct    route inp_route;	/* cached route */
> > +	struct    route inp_route;	/* [s] cached route */
> >  	struct    refcnt inp_refcnt;	/* refcount PCB, delay memory free */
> >  	int	  inp_flags;		/* generic IP/datagram flags */
> >  	union {				/* Header prototype. */
> > Index: netinet/ip_input.c
> > ===================================================================
> > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_input.c,v
> > diff -u -p -r1.401 ip_input.c
> > --- netinet/ip_input.c	6 Aug 2024 16:56:09 -0000	1.401
> > +++ netinet/ip_input.c	5 Nov 2024 23:18:13 -0000
> > @@ -83,23 +83,16 @@
> >  #include <netinet/ip_carp.h>
> >  #endif
> >  
> > -/*
> > - * Locks used to protect global variables in this file:
> > - *	I	immutable after creation
> > - *	a	atomic operations
> > - *	N	net lock
> > - */
> > -
> >  /* values controllable via sysctl */
> > -int	ip_forwarding = 0;			/* [a] */
> > +int	ip_forwarding = 0;
> >  int	ipmforwarding = 0;
> >  int	ipmultipath = 0;
> > -int	ip_sendredirects = 1;			/* [a] */
> > +int	ip_sendredirects = 1;
> >  int	ip_dosourceroute = 0;
> >  int	ip_defttl = IPDEFTTL;
> >  int	ip_mtudisc = 1;
> >  int	ip_mtudisc_timeout = IPMTUDISCTIMEOUT;
> > -int	ip_directedbcast = 0;			/* [a] */
> > +int	ip_directedbcast = 0;
> >  
> >  /* Protects `ipq' and `ip_frags'. */
> >  struct mutex	ipq_mutex = MUTEX_INITIALIZER(IPL_SOFTNET);
> > Index: netinet/ip_var.h
> > ===================================================================
> > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_var.h,v
> > diff -u -p -r1.120 ip_var.h
> > --- netinet/ip_var.h	12 Jul 2024 19:50:35 -0000	1.120
> > +++ netinet/ip_var.h	5 Nov 2024 23:18:13 -0000
> > @@ -36,6 +36,13 @@
> >  #define _NETINET_IP_VAR_H_
> >  
> >  /*
> > + * Locks used to protect global variables in this file:
> > + *	I	immutable after creation
> > + *	a	atomic operations
> > + *	N	net lock
> > + */
> > +
> > +/*
> >   * Structure stored in mbuf in inpcb.ip_options
> >   * and passed to ip_output when ip options are in use.
> >   * The actual length of the options (including ipopt_dst)
> > @@ -216,19 +223,20 @@ extern int ip_defttl;			/* default IP tt
> >  
> >  #define IPMTUDISCTIMEOUT (10 * 60)	/* as per RFC 1191 */
> >  
> > -extern int ip_mtudisc;			/* mtu discovery */
> > +extern int ip_mtudisc;			/* [N] mtu discovery */
> >  extern int ip_mtudisc_timeout;		/* seconds to timeout mtu discovery */
> >  
> >  extern int ipport_firstauto;		/* min port for port allocation */
> >  extern int ipport_lastauto;		/* max port for port allocation */
> >  extern int ipport_hifirstauto;		/* min dynamic/private port number */
> >  extern int ipport_hilastauto;		/* max dynamic/private port number */
> > -extern int ip_forwarding;		/* enable IP forwarding */
> > +extern int ip_forwarding;		/* [a] enable IP forwarding */
> >  #ifdef MROUTING
> >  extern int ipmforwarding;		/* enable multicast forwarding */
> >  #endif
> >  extern int ipmultipath;			/* enable multipath routing */
> > -extern int ip_directedbcast;		/* accept all broadcast packets */
> > +extern int ip_sendredirects;		/* [a] send icmp redirect while forwd */
> > +extern int ip_directedbcast;		/* [a] accept all broadcast packets */
> >  extern unsigned int la_hold_total;
> >  
> >  extern const struct pr_usrreqs rip_usrreqs;
> > Index: netinet/tcp_fsm.h
> > ===================================================================
> > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_fsm.h,v
> > diff -u -p -r1.9 tcp_fsm.h
> > --- netinet/tcp_fsm.h	5 Feb 2018 14:53:26 -0000	1.9
> > +++ netinet/tcp_fsm.h	5 Nov 2024 23:18:13 -0000
> > @@ -68,7 +68,7 @@
> >   * determined by state, with the proviso that TH_FIN is sent only
> >   * if all data queued for output is included in the segment.
> >   */
> > -u_char	tcp_outflags[TCP_NSTATES] = {
> > +const u_char tcp_outflags[TCP_NSTATES] = {
> >      TH_RST|TH_ACK, 0, TH_SYN, TH_SYN|TH_ACK,
> >      TH_ACK, TH_ACK,
> >      TH_FIN|TH_ACK, TH_FIN|TH_ACK, TH_FIN|TH_ACK, TH_ACK, TH_ACK,
> > @@ -76,7 +76,7 @@ u_char	tcp_outflags[TCP_NSTATES] = {
> >  #endif /* TCPOUTFLAGS */
> >  
> >  #ifdef	TCPSTATES
> > -const char *tcpstates[] = {
> > +const char *const tcpstates[] = {
> >  	"CLOSED",	"LISTEN",	"SYN_SENT",	"SYN_RCVD",
> >  	"ESTABLISHED",	"CLOSE_WAIT",	"FIN_WAIT_1",	"CLOSING",
> >  	"LAST_ACK",	"FIN_WAIT_2",	"TIME_WAIT",
> > Index: netinet/tcp_input.c
> > ===================================================================
> > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_input.c,v
> > diff -u -p -r1.407 tcp_input.c
> > --- netinet/tcp_input.c	26 Aug 2024 13:55:14 -0000	1.407
> > +++ netinet/tcp_input.c	5 Nov 2024 23:18:13 -0000
> > @@ -2788,11 +2788,11 @@ tcp_xmit_timer(struct tcpcb *tp, int32_t
> >  int
> >  tcp_mss(struct tcpcb *tp, int offer)
> >  {
> > +	struct inpcb *inp;
> >  	struct rtentry *rt;
> >  	struct ifnet *ifp = NULL;
> > -	int mss, mssopt;
> > -	int iphlen;
> > -	struct inpcb *inp;
> > +	int mss, mssopt, iphlen, do_rfc3390;
> > +	u_int rtmtu;
> >  
> >  	inp = tp->t_inpcb;
> >  
> > @@ -2825,12 +2825,13 @@ tcp_mss(struct tcpcb *tp, int offer)
> >  	 * if there's an mtu associated with the route and we support
> >  	 * path MTU discovery for the underlying protocol family, use it.
> >  	 */
> > -	if (rt->rt_mtu) {
> > +	rtmtu = atomic_load_int(&rt->rt_mtu);
> > +	if (rtmtu) {
> >  		/*
> >  		 * One may wish to lower MSS to take into account options,
> >  		 * especially security-related options.
> >  		 */
> > -		if (tp->pf == AF_INET6 && rt->rt_mtu < IPV6_MMTU) {
> > +		if (tp->pf == AF_INET6 && rtmtu < IPV6_MMTU) {
> >  			/*
> >  			 * RFC2460 section 5, last paragraph: if path MTU is
> >  			 * smaller than 1280, use 1280 as packet size and
> > @@ -2839,8 +2840,7 @@ tcp_mss(struct tcpcb *tp, int offer)
> >  			mss = IPV6_MMTU - iphlen - sizeof(struct ip6_frag) -
> >  			    sizeof(struct tcphdr);
> >  		} else {
> > -			mss = rt->rt_mtu - iphlen -
> > -			    sizeof(struct tcphdr);
> > +			mss = rtmtu - iphlen - sizeof(struct tcphdr);
> >  		}
> >  	} else if (ifp->if_flags & IFF_LOOPBACK) {
> >  		mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
> > @@ -2900,6 +2900,7 @@ tcp_mss(struct tcpcb *tp, int offer)
> >  		mss -= TCPOLEN_SIGLEN;
> >  #endif
> >  
> > +	do_rfc3390 = atomic_load_int(&tcp_do_rfc3390);
> >  	if (offer == -1) {
> >  		/* mss changed due to Path MTU discovery */
> >  		tp->t_flags &= ~TF_PMTUD_PEND;
> > @@ -2914,10 +2915,10 @@ tcp_mss(struct tcpcb *tp, int offer)
> >  			tp->snd_cwnd = ulmax((tp->snd_cwnd / tp->t_maxseg) *
> >  			    mss, mss);
> >  		}
> > -	} else if (tcp_do_rfc3390 == 2) {
> > +	} else if (do_rfc3390 == 2) {
> >  		/* increase initial window  */
> >  		tp->snd_cwnd = ulmin(10 * mss, ulmax(2 * mss, 14600));
> > -	} else if (tcp_do_rfc3390) {
> > +	} else if (do_rfc3390) {
> >  		/* increase initial window  */
> >  		tp->snd_cwnd = ulmin(4 * mss, ulmax(2 * mss, 4380));
> >  	} else
> > Index: netinet/tcp_output.c
> > ===================================================================
> > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_output.c,v
> > diff -u -p -r1.145 tcp_output.c
> > --- netinet/tcp_output.c	14 May 2024 09:39:02 -0000	1.145
> > +++ netinet/tcp_output.c	5 Nov 2024 23:18:13 -0000
> > @@ -103,8 +103,6 @@
> >  extern struct mbuf *m_copypack();
> >  #endif
> >  
> > -extern int tcprexmtthresh;
> > -
> >  #ifdef TCP_SACK_DEBUG
> >  void tcp_print_holes(struct tcpcb *tp);
> >  
> > @@ -350,7 +348,7 @@ again:
> >  	txmaxseg = ulmin(so->so_snd.sb_hiwat / 2, tp->t_maxseg);
> >  
> >  	if (len > txmaxseg) {
> > -		if (tcp_do_tso &&
> > +		if (atomic_load_int(&tcp_do_tso) &&
> >  		    tp->t_inpcb->inp_options == NULL &&
> >  		    tp->t_inpcb->inp_outputopts6 == NULL &&
> >  #ifdef TCP_SIGNATURE
> > Index: netinet/tcp_timer.c
> > ===================================================================
> > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_timer.c,v
> > diff -u -p -r1.76 tcp_timer.c
> > --- netinet/tcp_timer.c	28 Jan 2024 20:34:25 -0000	1.76
> > +++ netinet/tcp_timer.c	5 Nov 2024 23:18:13 -0000
> > @@ -167,10 +167,10 @@ tcp_canceltimers(struct tcpcb *tp)
> >  		TCP_TIMER_DISARM(tp, i);
> >  }
> >  
> > -int	tcp_backoff[TCP_MAXRXTSHIFT + 1] =
> > +const int tcp_backoff[TCP_MAXRXTSHIFT + 1] =
> >      { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };
> >  
> > -int tcp_totbackoff = 511;	/* sum of tcp_backoff[] */
> > +const int tcp_totbackoff = 511;	/* sum of tcp_backoff[] */
> >  
> >  /*
> >   * TCP timer processing.
> > Index: netinet/tcp_timer.h
> > ===================================================================
> > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_timer.h,v
> > diff -u -p -r1.21 tcp_timer.h
> > --- netinet/tcp_timer.h	29 Jan 2024 22:47:13 -0000	1.21
> > +++ netinet/tcp_timer.h	5 Nov 2024 23:18:13 -0000
> > @@ -154,13 +154,12 @@ typedef void (*tcp_timer_func_t)(void *)
> >  extern const tcp_timer_func_t tcp_timer_funcs[TCPT_NTIMERS];
> >  
> >  extern int tcp_delack_msecs;		/* delayed ACK timeout in millisecs */
> > -extern int tcptv_keep_init;
> >  extern int tcp_always_keepalive;	/* assume SO_KEEPALIVE is always set */
> >  extern int tcp_keepidle;		/* time before keepalive probes begin */
> >  extern int tcp_keepintvl;		/* time between keepalive probes */
> >  extern int tcp_maxidle;			/* time to drop after starting probes */
> >  extern int tcp_ttl;			/* time to live for TCP segs */
> > -extern int tcp_backoff[];
> > +extern const int tcp_backoff[];
> >  
> >  void	tcp_timer_init(void);
> >  #endif /* _KERNEL */
> > Index: netinet/tcp_usrreq.c
> > ===================================================================
> > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_usrreq.c,v
> > diff -u -p -r1.231 tcp_usrreq.c
> > --- netinet/tcp_usrreq.c	12 Apr 2024 16:07:09 -0000	1.231
> > +++ netinet/tcp_usrreq.c	5 Nov 2024 23:18:13 -0000
> > @@ -105,12 +105,12 @@
> >  #ifndef TCP_SENDSPACE
> >  #define	TCP_SENDSPACE	1024*16
> >  #endif
> > -u_int	tcp_sendspace = TCP_SENDSPACE;
> > +u_int	tcp_sendspace = TCP_SENDSPACE;		/* [I] */
> >  #ifndef TCP_RECVSPACE
> >  #define	TCP_RECVSPACE	1024*16
> >  #endif
> > -u_int	tcp_recvspace = TCP_RECVSPACE;
> > -u_int	tcp_autorcvbuf_inc = 16 * 1024;
> > +u_int	tcp_recvspace = TCP_RECVSPACE;		/* [I] */
> > +u_int	tcp_autorcvbuf_inc = 16 * 1024;		/* [I] */
> >  
> >  const struct pr_usrreqs tcp_usrreqs = {
> >  	.pru_attach	= tcp_attach,
> > @@ -1516,13 +1516,14 @@ tcp_update_sndspace(struct tcpcb *tp)
> >  		/* low on memory try to get rid of some */
> >  		if (tcp_sendspace < nmax)
> >  			nmax = tcp_sendspace;
> > -	} else if (so->so_snd.sb_wat != tcp_sendspace)
> > +	} else if (so->so_snd.sb_wat != tcp_sendspace) {
> >  		/* user requested buffer size, auto-scaling disabled */
> >  		nmax = so->so_snd.sb_wat;
> > -	else
> > +	} else {
> >  		/* automatic buffer scaling */
> >  		nmax = MIN(sb_max, so->so_snd.sb_wat + tp->snd_max -
> >  		    tp->snd_una);
> > +	}
> >  
> >  	/* a writable socket must be preserved because of poll(2) semantics */
> >  	if (sbspace(so, &so->so_snd) >= so->so_snd.sb_lowat) {
> > @@ -1556,10 +1557,10 @@ tcp_update_rcvspace(struct tcpcb *tp)
> >  		/* low on memory try to get rid of some */
> >  		if (tcp_recvspace < nmax)
> >  			nmax = tcp_recvspace;
> > -	} else if (so->so_rcv.sb_wat != tcp_recvspace)
> > +	} else if (so->so_rcv.sb_wat != tcp_recvspace) {
> >  		/* user requested buffer size, auto-scaling disabled */
> >  		nmax = so->so_rcv.sb_wat;
> > -	else {
> > +	} else {
> >  		/* automatic buffer scaling */
> >  		if (tp->rfbuf_cnt > so->so_rcv.sb_hiwat / 8 * 7)
> >  			nmax = MIN(sb_max, so->so_rcv.sb_hiwat +
> > Index: netinet/tcp_var.h
> > ===================================================================
> > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_var.h,v
> > diff -u -p -r1.178 tcp_var.h
> > --- netinet/tcp_var.h	13 May 2024 01:15:53 -0000	1.178
> > +++ netinet/tcp_var.h	5 Nov 2024 23:18:13 -0000
> > @@ -228,6 +228,7 @@ struct tcp_opt_info {
> >  /*
> >   * Locks used to protect global data and struct members:
> >   *	I	immutable after creation
> > + *	a	atomic operations
> >   *	N	net lock
> >   *	S	syn_cache_mtx		tcp syn cache global mutex
> >   */
> > @@ -678,16 +679,17 @@ extern	const struct pr_usrreqs tcp6_usrr
> >  extern	struct pool tcpcb_pool;
> >  extern	struct inpcbtable tcbtable, tcb6table;	/* queue of active tcpcb's */
> >  extern	int tcp_do_rfc1323;	/* enabled/disabled? */
> > +extern	int tcprexmtthresh;	/* [I] */
> >  extern	int tcptv_keep_init;	/* [N] time to keep alive initial SYN packet */
> > -extern	int tcp_mssdflt;	/* default maximum segment size */
> > +extern	int tcp_mssdflt;	/* [N] default maximum segment size */
> >  extern	int tcp_rst_ppslim;	/* maximum outgoing RST packet per second */
> >  extern	int tcp_ack_on_push;	/* ACK immediately on PUSH */
> >  extern	int tcp_do_sack;	/* SACK enabled/disabled */
> >  extern	struct pool sackhl_pool;
> >  extern	int tcp_sackhole_limit;	/* max entries for tcp sack queues */
> > -extern	int tcp_do_ecn;		/* RFC3168 ECN enabled/disabled? */
> > -extern	int tcp_do_rfc3390;	/* RFC3390 Increasing TCP's Initial Window */
> > -extern	int tcp_do_tso;		/* enable TSO for TCP output packets */
> > +extern	int tcp_do_ecn;		/* [N] RFC3168 ECN enabled/disabled? */
> > +extern	int tcp_do_rfc3390;	/* [a] RFC3390 Increasing TCP Initial Window */
> > +extern	int tcp_do_tso;		/* [a] enable TSO for TCP output packets */
> >  
> >  extern	struct pool tcpqe_pool;
> >  extern	int tcp_reass_limit;	/* max entries for tcp reass queues */
> > Index: sys/mbuf.h
> > ===================================================================
> > RCS file: /data/mirror/openbsd/cvs/src/sys/sys/mbuf.h,v
> > diff -u -p -r1.265 mbuf.h
> > --- sys/mbuf.h	5 Nov 2024 13:15:13 -0000	1.265
> > +++ sys/mbuf.h	5 Nov 2024 23:18:13 -0000
> > @@ -411,7 +411,7 @@ struct mbuf_queue {
> >  struct pool;
> >  
> >  extern	long nmbclust;			/* limit on the # of clusters */
> > -extern	int max_linkhdr;		/* largest link-level header */
> > +extern	int max_linkhdr;		/* [I] largest link-level header */
> >  extern	int max_protohdr;		/* largest protocol header */
> >  extern	int max_hdr;			/* largest link+protocol header */
> >  extern	struct cpumem *mbstat;		/* mbuf statistics counter */
> > Index: sys/socketvar.h
> > ===================================================================
> > RCS file: /data/mirror/openbsd/cvs/src/sys/sys/socketvar.h,v
> > diff -u -p -r1.134 socketvar.h
> > --- sys/socketvar.h	9 Sep 2024 07:38:45 -0000	1.134
> > +++ sys/socketvar.h	5 Nov 2024 23:18:13 -0000
> > @@ -376,7 +376,7 @@ sbassertlocked(struct sockbuf *sb)
> >  	}								\
> >  } while (/*CONSTCOND*/0)
> >  
> > -extern u_long sb_max;
> > +extern u_long sb_max;			/* [I] */
> >  
> >  extern struct pool	socket_pool;
> >  
> >