Index | Thread | Search

From:
Vitaliy Makkoveev <mvs@openbsd.org>
Subject:
Re: tcp output MP fixes
To:
Alexander Bluhm <bluhm@openbsd.org>
Cc:
tech@openbsd.org
Date:
Wed, 6 Nov 2024 14:40:16 +0300

Download raw body.

Thread
On Wed, Nov 06, 2024 at 11:50:03AM +0100, Alexander Bluhm wrote:
> Hi,
> 
> Next goal is running TCP socket layer with shared netlock.  I went
> through tcp_output() and looked for MP races.  Mostly documentation
> and small fixes.
> 
> I am not sure about sbchecklowmem().  For MP safety we must serialize
> calls to m_pool_used() and protect sblowmem with a mutex.  In
> practice not much can ever go wrong.  I have chosen mtx_enter_try()
> to update without spinning and using a possibly obsololete value
> in case of lock contention.
> 
> ok?
> 

Could you keep locking description with variables definition? We could
have many places where variable declared as extern, so each of them
should have copy-pasted comment. Imagine you need to keep them in sync.

> bluhm
> 
> Index: kern/uipc_socket2.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/kern/uipc_socket2.c,v
> diff -u -p -r1.158 uipc_socket2.c
> --- kern/uipc_socket2.c	12 Jul 2024 19:50:35 -0000	1.158
> +++ kern/uipc_socket2.c	6 Nov 2024 10:42:27 -0000
> @@ -49,6 +49,9 @@
>  
>  u_long	sb_max = SB_MAX;		/* patchable */
>  
> +struct mutex sblowmem_mtx = MUTEX_INITIALIZER(IPL_SOFTNET);
> +int sblowmem;
> +
>  extern struct pool mclpools[];
>  extern struct pool mbpool;
>  
> @@ -683,15 +674,24 @@ sbcheckreserve(u_long cnt, u_long defcnt
>  int
>  sbchecklowmem(void)
>  {
> -	static int sblowmem;
> -	unsigned int used = m_pool_used();
> +	int lowmem;
> +	unsigned int used;
>  
> +	if (mtx_enter_try(&sblowmem_mtx) == 0) {
> +		/* another thread is updating sblowmem, use old or new value */
> +		return (sblowmem);
> +	}
> +
> +	used = m_pool_used();
>  	if (used < 60)
>  		sblowmem = 0;
>  	else if (used > 80)
>  		sblowmem = 1;
> +	lowmem = sblowmem;
> +
> +	mtx_leave(&sblowmem_mtx);
>  
> -	return (sblowmem);
> +	return (lowmem);
>  }
>  
>  /*
> Index: netinet/in_pcb.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.h,v
> diff -u -p -r1.159 in_pcb.h
> --- netinet/in_pcb.h	5 Nov 2024 10:49:23 -0000	1.159
> +++ netinet/in_pcb.h	5 Nov 2024 23:18:13 -0000
> @@ -140,7 +140,7 @@ struct inpcb {
>  	u_int16_t inp_lport;		/* [t] local port */
>  	struct	  socket *inp_socket;	/* [I] back pointer to socket */
>  	caddr_t	  inp_ppcb;		/* pointer to per-protocol pcb */
> -	struct    route inp_route;	/* cached route */
> +	struct    route inp_route;	/* [s] cached route */
>  	struct    refcnt inp_refcnt;	/* refcount PCB, delay memory free */
>  	int	  inp_flags;		/* generic IP/datagram flags */
>  	union {				/* Header prototype. */
> Index: netinet/ip_input.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_input.c,v
> diff -u -p -r1.401 ip_input.c
> --- netinet/ip_input.c	6 Aug 2024 16:56:09 -0000	1.401
> +++ netinet/ip_input.c	5 Nov 2024 23:18:13 -0000
> @@ -83,23 +83,16 @@
>  #include <netinet/ip_carp.h>
>  #endif
>  
> -/*
> - * Locks used to protect global variables in this file:
> - *	I	immutable after creation
> - *	a	atomic operations
> - *	N	net lock
> - */
> -
>  /* values controllable via sysctl */
> -int	ip_forwarding = 0;			/* [a] */
> +int	ip_forwarding = 0;
>  int	ipmforwarding = 0;
>  int	ipmultipath = 0;
> -int	ip_sendredirects = 1;			/* [a] */
> +int	ip_sendredirects = 1;
>  int	ip_dosourceroute = 0;
>  int	ip_defttl = IPDEFTTL;
>  int	ip_mtudisc = 1;
>  int	ip_mtudisc_timeout = IPMTUDISCTIMEOUT;
> -int	ip_directedbcast = 0;			/* [a] */
> +int	ip_directedbcast = 0;
>  
>  /* Protects `ipq' and `ip_frags'. */
>  struct mutex	ipq_mutex = MUTEX_INITIALIZER(IPL_SOFTNET);
> Index: netinet/ip_var.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_var.h,v
> diff -u -p -r1.120 ip_var.h
> --- netinet/ip_var.h	12 Jul 2024 19:50:35 -0000	1.120
> +++ netinet/ip_var.h	5 Nov 2024 23:18:13 -0000
> @@ -36,6 +36,13 @@
>  #define _NETINET_IP_VAR_H_
>  
>  /*
> + * Locks used to protect global variables in this file:
> + *	I	immutable after creation
> + *	a	atomic operations
> + *	N	net lock
> + */
> +
> +/*
>   * Structure stored in mbuf in inpcb.ip_options
>   * and passed to ip_output when ip options are in use.
>   * The actual length of the options (including ipopt_dst)
> @@ -216,19 +223,20 @@ extern int ip_defttl;			/* default IP tt
>  
>  #define IPMTUDISCTIMEOUT (10 * 60)	/* as per RFC 1191 */
>  
> -extern int ip_mtudisc;			/* mtu discovery */
> +extern int ip_mtudisc;			/* [N] mtu discovery */
>  extern int ip_mtudisc_timeout;		/* seconds to timeout mtu discovery */
>  
>  extern int ipport_firstauto;		/* min port for port allocation */
>  extern int ipport_lastauto;		/* max port for port allocation */
>  extern int ipport_hifirstauto;		/* min dynamic/private port number */
>  extern int ipport_hilastauto;		/* max dynamic/private port number */
> -extern int ip_forwarding;		/* enable IP forwarding */
> +extern int ip_forwarding;		/* [a] enable IP forwarding */
>  #ifdef MROUTING
>  extern int ipmforwarding;		/* enable multicast forwarding */
>  #endif
>  extern int ipmultipath;			/* enable multipath routing */
> -extern int ip_directedbcast;		/* accept all broadcast packets */
> +extern int ip_sendredirects;		/* [a] send icmp redirect while forwd */
> +extern int ip_directedbcast;		/* [a] accept all broadcast packets */
>  extern unsigned int la_hold_total;
>  
>  extern const struct pr_usrreqs rip_usrreqs;
> Index: netinet/tcp_fsm.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_fsm.h,v
> diff -u -p -r1.9 tcp_fsm.h
> --- netinet/tcp_fsm.h	5 Feb 2018 14:53:26 -0000	1.9
> +++ netinet/tcp_fsm.h	5 Nov 2024 23:18:13 -0000
> @@ -68,7 +68,7 @@
>   * determined by state, with the proviso that TH_FIN is sent only
>   * if all data queued for output is included in the segment.
>   */
> -u_char	tcp_outflags[TCP_NSTATES] = {
> +const u_char tcp_outflags[TCP_NSTATES] = {
>      TH_RST|TH_ACK, 0, TH_SYN, TH_SYN|TH_ACK,
>      TH_ACK, TH_ACK,
>      TH_FIN|TH_ACK, TH_FIN|TH_ACK, TH_FIN|TH_ACK, TH_ACK, TH_ACK,
> @@ -76,7 +76,7 @@ u_char	tcp_outflags[TCP_NSTATES] = {
>  #endif /* TCPOUTFLAGS */
>  
>  #ifdef	TCPSTATES
> -const char *tcpstates[] = {
> +const char *const tcpstates[] = {
>  	"CLOSED",	"LISTEN",	"SYN_SENT",	"SYN_RCVD",
>  	"ESTABLISHED",	"CLOSE_WAIT",	"FIN_WAIT_1",	"CLOSING",
>  	"LAST_ACK",	"FIN_WAIT_2",	"TIME_WAIT",
> Index: netinet/tcp_input.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_input.c,v
> diff -u -p -r1.407 tcp_input.c
> --- netinet/tcp_input.c	26 Aug 2024 13:55:14 -0000	1.407
> +++ netinet/tcp_input.c	5 Nov 2024 23:18:13 -0000
> @@ -2788,11 +2788,11 @@ tcp_xmit_timer(struct tcpcb *tp, int32_t
>  int
>  tcp_mss(struct tcpcb *tp, int offer)
>  {
> +	struct inpcb *inp;
>  	struct rtentry *rt;
>  	struct ifnet *ifp = NULL;
> -	int mss, mssopt;
> -	int iphlen;
> -	struct inpcb *inp;
> +	int mss, mssopt, iphlen, do_rfc3390;
> +	u_int rtmtu;
>  
>  	inp = tp->t_inpcb;
>  
> @@ -2825,12 +2825,13 @@ tcp_mss(struct tcpcb *tp, int offer)
>  	 * if there's an mtu associated with the route and we support
>  	 * path MTU discovery for the underlying protocol family, use it.
>  	 */
> -	if (rt->rt_mtu) {
> +	rtmtu = atomic_load_int(&rt->rt_mtu);
> +	if (rtmtu) {
>  		/*
>  		 * One may wish to lower MSS to take into account options,
>  		 * especially security-related options.
>  		 */
> -		if (tp->pf == AF_INET6 && rt->rt_mtu < IPV6_MMTU) {
> +		if (tp->pf == AF_INET6 && rtmtu < IPV6_MMTU) {
>  			/*
>  			 * RFC2460 section 5, last paragraph: if path MTU is
>  			 * smaller than 1280, use 1280 as packet size and
> @@ -2839,8 +2840,7 @@ tcp_mss(struct tcpcb *tp, int offer)
>  			mss = IPV6_MMTU - iphlen - sizeof(struct ip6_frag) -
>  			    sizeof(struct tcphdr);
>  		} else {
> -			mss = rt->rt_mtu - iphlen -
> -			    sizeof(struct tcphdr);
> +			mss = rtmtu - iphlen - sizeof(struct tcphdr);
>  		}
>  	} else if (ifp->if_flags & IFF_LOOPBACK) {
>  		mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
> @@ -2900,6 +2900,7 @@ tcp_mss(struct tcpcb *tp, int offer)
>  		mss -= TCPOLEN_SIGLEN;
>  #endif
>  
> +	do_rfc3390 = atomic_load_int(&tcp_do_rfc3390);
>  	if (offer == -1) {
>  		/* mss changed due to Path MTU discovery */
>  		tp->t_flags &= ~TF_PMTUD_PEND;
> @@ -2914,10 +2915,10 @@ tcp_mss(struct tcpcb *tp, int offer)
>  			tp->snd_cwnd = ulmax((tp->snd_cwnd / tp->t_maxseg) *
>  			    mss, mss);
>  		}
> -	} else if (tcp_do_rfc3390 == 2) {
> +	} else if (do_rfc3390 == 2) {
>  		/* increase initial window  */
>  		tp->snd_cwnd = ulmin(10 * mss, ulmax(2 * mss, 14600));
> -	} else if (tcp_do_rfc3390) {
> +	} else if (do_rfc3390) {
>  		/* increase initial window  */
>  		tp->snd_cwnd = ulmin(4 * mss, ulmax(2 * mss, 4380));
>  	} else
> Index: netinet/tcp_output.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_output.c,v
> diff -u -p -r1.145 tcp_output.c
> --- netinet/tcp_output.c	14 May 2024 09:39:02 -0000	1.145
> +++ netinet/tcp_output.c	5 Nov 2024 23:18:13 -0000
> @@ -103,8 +103,6 @@
>  extern struct mbuf *m_copypack();
>  #endif
>  
> -extern int tcprexmtthresh;
> -
>  #ifdef TCP_SACK_DEBUG
>  void tcp_print_holes(struct tcpcb *tp);
>  
> @@ -350,7 +348,7 @@ again:
>  	txmaxseg = ulmin(so->so_snd.sb_hiwat / 2, tp->t_maxseg);
>  
>  	if (len > txmaxseg) {
> -		if (tcp_do_tso &&
> +		if (atomic_load_int(&tcp_do_tso) &&
>  		    tp->t_inpcb->inp_options == NULL &&
>  		    tp->t_inpcb->inp_outputopts6 == NULL &&
>  #ifdef TCP_SIGNATURE
> Index: netinet/tcp_timer.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_timer.c,v
> diff -u -p -r1.76 tcp_timer.c
> --- netinet/tcp_timer.c	28 Jan 2024 20:34:25 -0000	1.76
> +++ netinet/tcp_timer.c	5 Nov 2024 23:18:13 -0000
> @@ -167,10 +167,10 @@ tcp_canceltimers(struct tcpcb *tp)
>  		TCP_TIMER_DISARM(tp, i);
>  }
>  
> -int	tcp_backoff[TCP_MAXRXTSHIFT + 1] =
> +const int tcp_backoff[TCP_MAXRXTSHIFT + 1] =
>      { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };
>  
> -int tcp_totbackoff = 511;	/* sum of tcp_backoff[] */
> +const int tcp_totbackoff = 511;	/* sum of tcp_backoff[] */
>  
>  /*
>   * TCP timer processing.
> Index: netinet/tcp_timer.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_timer.h,v
> diff -u -p -r1.21 tcp_timer.h
> --- netinet/tcp_timer.h	29 Jan 2024 22:47:13 -0000	1.21
> +++ netinet/tcp_timer.h	5 Nov 2024 23:18:13 -0000
> @@ -154,13 +154,12 @@ typedef void (*tcp_timer_func_t)(void *)
>  extern const tcp_timer_func_t tcp_timer_funcs[TCPT_NTIMERS];
>  
>  extern int tcp_delack_msecs;		/* delayed ACK timeout in millisecs */
> -extern int tcptv_keep_init;
>  extern int tcp_always_keepalive;	/* assume SO_KEEPALIVE is always set */
>  extern int tcp_keepidle;		/* time before keepalive probes begin */
>  extern int tcp_keepintvl;		/* time between keepalive probes */
>  extern int tcp_maxidle;			/* time to drop after starting probes */
>  extern int tcp_ttl;			/* time to live for TCP segs */
> -extern int tcp_backoff[];
> +extern const int tcp_backoff[];
>  
>  void	tcp_timer_init(void);
>  #endif /* _KERNEL */
> Index: netinet/tcp_usrreq.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_usrreq.c,v
> diff -u -p -r1.231 tcp_usrreq.c
> --- netinet/tcp_usrreq.c	12 Apr 2024 16:07:09 -0000	1.231
> +++ netinet/tcp_usrreq.c	5 Nov 2024 23:18:13 -0000
> @@ -105,12 +105,12 @@
>  #ifndef TCP_SENDSPACE
>  #define	TCP_SENDSPACE	1024*16
>  #endif
> -u_int	tcp_sendspace = TCP_SENDSPACE;
> +u_int	tcp_sendspace = TCP_SENDSPACE;		/* [I] */
>  #ifndef TCP_RECVSPACE
>  #define	TCP_RECVSPACE	1024*16
>  #endif
> -u_int	tcp_recvspace = TCP_RECVSPACE;
> -u_int	tcp_autorcvbuf_inc = 16 * 1024;
> +u_int	tcp_recvspace = TCP_RECVSPACE;		/* [I] */
> +u_int	tcp_autorcvbuf_inc = 16 * 1024;		/* [I] */
>  
>  const struct pr_usrreqs tcp_usrreqs = {
>  	.pru_attach	= tcp_attach,
> @@ -1516,13 +1516,14 @@ tcp_update_sndspace(struct tcpcb *tp)
>  		/* low on memory try to get rid of some */
>  		if (tcp_sendspace < nmax)
>  			nmax = tcp_sendspace;
> -	} else if (so->so_snd.sb_wat != tcp_sendspace)
> +	} else if (so->so_snd.sb_wat != tcp_sendspace) {
>  		/* user requested buffer size, auto-scaling disabled */
>  		nmax = so->so_snd.sb_wat;
> -	else
> +	} else {
>  		/* automatic buffer scaling */
>  		nmax = MIN(sb_max, so->so_snd.sb_wat + tp->snd_max -
>  		    tp->snd_una);
> +	}
>  
>  	/* a writable socket must be preserved because of poll(2) semantics */
>  	if (sbspace(so, &so->so_snd) >= so->so_snd.sb_lowat) {
> @@ -1556,10 +1557,10 @@ tcp_update_rcvspace(struct tcpcb *tp)
>  		/* low on memory try to get rid of some */
>  		if (tcp_recvspace < nmax)
>  			nmax = tcp_recvspace;
> -	} else if (so->so_rcv.sb_wat != tcp_recvspace)
> +	} else if (so->so_rcv.sb_wat != tcp_recvspace) {
>  		/* user requested buffer size, auto-scaling disabled */
>  		nmax = so->so_rcv.sb_wat;
> -	else {
> +	} else {
>  		/* automatic buffer scaling */
>  		if (tp->rfbuf_cnt > so->so_rcv.sb_hiwat / 8 * 7)
>  			nmax = MIN(sb_max, so->so_rcv.sb_hiwat +
> Index: netinet/tcp_var.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_var.h,v
> diff -u -p -r1.178 tcp_var.h
> --- netinet/tcp_var.h	13 May 2024 01:15:53 -0000	1.178
> +++ netinet/tcp_var.h	5 Nov 2024 23:18:13 -0000
> @@ -228,6 +228,7 @@ struct tcp_opt_info {
>  /*
>   * Locks used to protect global data and struct members:
>   *	I	immutable after creation
> + *	a	atomic operations
>   *	N	net lock
>   *	S	syn_cache_mtx		tcp syn cache global mutex
>   */
> @@ -678,16 +679,17 @@ extern	const struct pr_usrreqs tcp6_usrr
>  extern	struct pool tcpcb_pool;
>  extern	struct inpcbtable tcbtable, tcb6table;	/* queue of active tcpcb's */
>  extern	int tcp_do_rfc1323;	/* enabled/disabled? */
> +extern	int tcprexmtthresh;	/* [I] */
>  extern	int tcptv_keep_init;	/* [N] time to keep alive initial SYN packet */
> -extern	int tcp_mssdflt;	/* default maximum segment size */
> +extern	int tcp_mssdflt;	/* [N] default maximum segment size */
>  extern	int tcp_rst_ppslim;	/* maximum outgoing RST packet per second */
>  extern	int tcp_ack_on_push;	/* ACK immediately on PUSH */
>  extern	int tcp_do_sack;	/* SACK enabled/disabled */
>  extern	struct pool sackhl_pool;
>  extern	int tcp_sackhole_limit;	/* max entries for tcp sack queues */
> -extern	int tcp_do_ecn;		/* RFC3168 ECN enabled/disabled? */
> -extern	int tcp_do_rfc3390;	/* RFC3390 Increasing TCP's Initial Window */
> -extern	int tcp_do_tso;		/* enable TSO for TCP output packets */
> +extern	int tcp_do_ecn;		/* [N] RFC3168 ECN enabled/disabled? */
> +extern	int tcp_do_rfc3390;	/* [a] RFC3390 Increasing TCP Initial Window */
> +extern	int tcp_do_tso;		/* [a] enable TSO for TCP output packets */
>  
>  extern	struct pool tcpqe_pool;
>  extern	int tcp_reass_limit;	/* max entries for tcp reass queues */
> Index: sys/mbuf.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/sys/mbuf.h,v
> diff -u -p -r1.265 mbuf.h
> --- sys/mbuf.h	5 Nov 2024 13:15:13 -0000	1.265
> +++ sys/mbuf.h	5 Nov 2024 23:18:13 -0000
> @@ -411,7 +411,7 @@ struct mbuf_queue {
>  struct pool;
>  
>  extern	long nmbclust;			/* limit on the # of clusters */
> -extern	int max_linkhdr;		/* largest link-level header */
> +extern	int max_linkhdr;		/* [I] largest link-level header */
>  extern	int max_protohdr;		/* largest protocol header */
>  extern	int max_hdr;			/* largest link+protocol header */
>  extern	struct cpumem *mbstat;		/* mbuf statistics counter */
> Index: sys/socketvar.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/sys/socketvar.h,v
> diff -u -p -r1.134 socketvar.h
> --- sys/socketvar.h	9 Sep 2024 07:38:45 -0000	1.134
> +++ sys/socketvar.h	5 Nov 2024 23:18:13 -0000
> @@ -376,7 +376,7 @@ sbassertlocked(struct sockbuf *sb)
>  	}								\
>  } while (/*CONSTCOND*/0)
>  
> -extern u_long sb_max;
> +extern u_long sb_max;			/* [I] */
>  
>  extern struct pool	socket_pool;
>  
>