From: Vitaliy Makkoveev Subject: Re: tcp output MP fixes To: Alexander Bluhm Cc: tech@openbsd.org Date: Wed, 6 Nov 2024 14:40:16 +0300 On Wed, Nov 06, 2024 at 11:50:03AM +0100, Alexander Bluhm wrote: > Hi, > > Next goal is running TCP socket layer with shared netlock. I went > through tcp_output() and looked for MP races. Mostly documentation > and small fixes. > > I am not sure about sbchecklowmem(). For MP safety we must serialize > calls to m_pool_used() and protect sblowmem with a mutex. In > practice not much can ever go wrong. I have chosen mtx_enter_try() > to update without spinning and using a possibly obsololete value > in case of lock contention. > > ok? > Could you keep locking description with variables definition? We could have many places where variable declared as extern, so each of them should have copy-pasted comment. Imagine you need to keep them in sync. > bluhm > > Index: kern/uipc_socket2.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/kern/uipc_socket2.c,v > diff -u -p -r1.158 uipc_socket2.c > --- kern/uipc_socket2.c 12 Jul 2024 19:50:35 -0000 1.158 > +++ kern/uipc_socket2.c 6 Nov 2024 10:42:27 -0000 > @@ -49,6 +49,9 @@ > > u_long sb_max = SB_MAX; /* patchable */ > > +struct mutex sblowmem_mtx = MUTEX_INITIALIZER(IPL_SOFTNET); > +int sblowmem; > + > extern struct pool mclpools[]; > extern struct pool mbpool; > > @@ -683,15 +674,24 @@ sbcheckreserve(u_long cnt, u_long defcnt > int > sbchecklowmem(void) > { > - static int sblowmem; > - unsigned int used = m_pool_used(); > + int lowmem; > + unsigned int used; > > + if (mtx_enter_try(&sblowmem_mtx) == 0) { > + /* another thread is updating sblowmem, use old or new value */ > + return (sblowmem); > + } > + > + used = m_pool_used(); > if (used < 60) > sblowmem = 0; > else if (used > 80) > sblowmem = 1; > + lowmem = sblowmem; > + > + mtx_leave(&sblowmem_mtx); > > - return (sblowmem); > + return (lowmem); > } > > /* > Index: netinet/in_pcb.h > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.h,v > diff -u -p -r1.159 in_pcb.h > --- netinet/in_pcb.h 5 Nov 2024 10:49:23 -0000 1.159 > +++ netinet/in_pcb.h 5 Nov 2024 23:18:13 -0000 > @@ -140,7 +140,7 @@ struct inpcb { > u_int16_t inp_lport; /* [t] local port */ > struct socket *inp_socket; /* [I] back pointer to socket */ > caddr_t inp_ppcb; /* pointer to per-protocol pcb */ > - struct route inp_route; /* cached route */ > + struct route inp_route; /* [s] cached route */ > struct refcnt inp_refcnt; /* refcount PCB, delay memory free */ > int inp_flags; /* generic IP/datagram flags */ > union { /* Header prototype. */ > Index: netinet/ip_input.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_input.c,v > diff -u -p -r1.401 ip_input.c > --- netinet/ip_input.c 6 Aug 2024 16:56:09 -0000 1.401 > +++ netinet/ip_input.c 5 Nov 2024 23:18:13 -0000 > @@ -83,23 +83,16 @@ > #include > #endif > > -/* > - * Locks used to protect global variables in this file: > - * I immutable after creation > - * a atomic operations > - * N net lock > - */ > - > /* values controllable via sysctl */ > -int ip_forwarding = 0; /* [a] */ > +int ip_forwarding = 0; > int ipmforwarding = 0; > int ipmultipath = 0; > -int ip_sendredirects = 1; /* [a] */ > +int ip_sendredirects = 1; > int ip_dosourceroute = 0; > int ip_defttl = IPDEFTTL; > int ip_mtudisc = 1; > int ip_mtudisc_timeout = IPMTUDISCTIMEOUT; > -int ip_directedbcast = 0; /* [a] */ > +int ip_directedbcast = 0; > > /* Protects `ipq' and `ip_frags'. */ > struct mutex ipq_mutex = MUTEX_INITIALIZER(IPL_SOFTNET); > Index: netinet/ip_var.h > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_var.h,v > diff -u -p -r1.120 ip_var.h > --- netinet/ip_var.h 12 Jul 2024 19:50:35 -0000 1.120 > +++ netinet/ip_var.h 5 Nov 2024 23:18:13 -0000 > @@ -36,6 +36,13 @@ > #define _NETINET_IP_VAR_H_ > > /* > + * Locks used to protect global variables in this file: > + * I immutable after creation > + * a atomic operations > + * N net lock > + */ > + > +/* > * Structure stored in mbuf in inpcb.ip_options > * and passed to ip_output when ip options are in use. > * The actual length of the options (including ipopt_dst) > @@ -216,19 +223,20 @@ extern int ip_defttl; /* default IP tt > > #define IPMTUDISCTIMEOUT (10 * 60) /* as per RFC 1191 */ > > -extern int ip_mtudisc; /* mtu discovery */ > +extern int ip_mtudisc; /* [N] mtu discovery */ > extern int ip_mtudisc_timeout; /* seconds to timeout mtu discovery */ > > extern int ipport_firstauto; /* min port for port allocation */ > extern int ipport_lastauto; /* max port for port allocation */ > extern int ipport_hifirstauto; /* min dynamic/private port number */ > extern int ipport_hilastauto; /* max dynamic/private port number */ > -extern int ip_forwarding; /* enable IP forwarding */ > +extern int ip_forwarding; /* [a] enable IP forwarding */ > #ifdef MROUTING > extern int ipmforwarding; /* enable multicast forwarding */ > #endif > extern int ipmultipath; /* enable multipath routing */ > -extern int ip_directedbcast; /* accept all broadcast packets */ > +extern int ip_sendredirects; /* [a] send icmp redirect while forwd */ > +extern int ip_directedbcast; /* [a] accept all broadcast packets */ > extern unsigned int la_hold_total; > > extern const struct pr_usrreqs rip_usrreqs; > Index: netinet/tcp_fsm.h > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_fsm.h,v > diff -u -p -r1.9 tcp_fsm.h > --- netinet/tcp_fsm.h 5 Feb 2018 14:53:26 -0000 1.9 > +++ netinet/tcp_fsm.h 5 Nov 2024 23:18:13 -0000 > @@ -68,7 +68,7 @@ > * determined by state, with the proviso that TH_FIN is sent only > * if all data queued for output is included in the segment. > */ > -u_char tcp_outflags[TCP_NSTATES] = { > +const u_char tcp_outflags[TCP_NSTATES] = { > TH_RST|TH_ACK, 0, TH_SYN, TH_SYN|TH_ACK, > TH_ACK, TH_ACK, > TH_FIN|TH_ACK, TH_FIN|TH_ACK, TH_FIN|TH_ACK, TH_ACK, TH_ACK, > @@ -76,7 +76,7 @@ u_char tcp_outflags[TCP_NSTATES] = { > #endif /* TCPOUTFLAGS */ > > #ifdef TCPSTATES > -const char *tcpstates[] = { > +const char *const tcpstates[] = { > "CLOSED", "LISTEN", "SYN_SENT", "SYN_RCVD", > "ESTABLISHED", "CLOSE_WAIT", "FIN_WAIT_1", "CLOSING", > "LAST_ACK", "FIN_WAIT_2", "TIME_WAIT", > Index: netinet/tcp_input.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_input.c,v > diff -u -p -r1.407 tcp_input.c > --- netinet/tcp_input.c 26 Aug 2024 13:55:14 -0000 1.407 > +++ netinet/tcp_input.c 5 Nov 2024 23:18:13 -0000 > @@ -2788,11 +2788,11 @@ tcp_xmit_timer(struct tcpcb *tp, int32_t > int > tcp_mss(struct tcpcb *tp, int offer) > { > + struct inpcb *inp; > struct rtentry *rt; > struct ifnet *ifp = NULL; > - int mss, mssopt; > - int iphlen; > - struct inpcb *inp; > + int mss, mssopt, iphlen, do_rfc3390; > + u_int rtmtu; > > inp = tp->t_inpcb; > > @@ -2825,12 +2825,13 @@ tcp_mss(struct tcpcb *tp, int offer) > * if there's an mtu associated with the route and we support > * path MTU discovery for the underlying protocol family, use it. > */ > - if (rt->rt_mtu) { > + rtmtu = atomic_load_int(&rt->rt_mtu); > + if (rtmtu) { > /* > * One may wish to lower MSS to take into account options, > * especially security-related options. > */ > - if (tp->pf == AF_INET6 && rt->rt_mtu < IPV6_MMTU) { > + if (tp->pf == AF_INET6 && rtmtu < IPV6_MMTU) { > /* > * RFC2460 section 5, last paragraph: if path MTU is > * smaller than 1280, use 1280 as packet size and > @@ -2839,8 +2840,7 @@ tcp_mss(struct tcpcb *tp, int offer) > mss = IPV6_MMTU - iphlen - sizeof(struct ip6_frag) - > sizeof(struct tcphdr); > } else { > - mss = rt->rt_mtu - iphlen - > - sizeof(struct tcphdr); > + mss = rtmtu - iphlen - sizeof(struct tcphdr); > } > } else if (ifp->if_flags & IFF_LOOPBACK) { > mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); > @@ -2900,6 +2900,7 @@ tcp_mss(struct tcpcb *tp, int offer) > mss -= TCPOLEN_SIGLEN; > #endif > > + do_rfc3390 = atomic_load_int(&tcp_do_rfc3390); > if (offer == -1) { > /* mss changed due to Path MTU discovery */ > tp->t_flags &= ~TF_PMTUD_PEND; > @@ -2914,10 +2915,10 @@ tcp_mss(struct tcpcb *tp, int offer) > tp->snd_cwnd = ulmax((tp->snd_cwnd / tp->t_maxseg) * > mss, mss); > } > - } else if (tcp_do_rfc3390 == 2) { > + } else if (do_rfc3390 == 2) { > /* increase initial window */ > tp->snd_cwnd = ulmin(10 * mss, ulmax(2 * mss, 14600)); > - } else if (tcp_do_rfc3390) { > + } else if (do_rfc3390) { > /* increase initial window */ > tp->snd_cwnd = ulmin(4 * mss, ulmax(2 * mss, 4380)); > } else > Index: netinet/tcp_output.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_output.c,v > diff -u -p -r1.145 tcp_output.c > --- netinet/tcp_output.c 14 May 2024 09:39:02 -0000 1.145 > +++ netinet/tcp_output.c 5 Nov 2024 23:18:13 -0000 > @@ -103,8 +103,6 @@ > extern struct mbuf *m_copypack(); > #endif > > -extern int tcprexmtthresh; > - > #ifdef TCP_SACK_DEBUG > void tcp_print_holes(struct tcpcb *tp); > > @@ -350,7 +348,7 @@ again: > txmaxseg = ulmin(so->so_snd.sb_hiwat / 2, tp->t_maxseg); > > if (len > txmaxseg) { > - if (tcp_do_tso && > + if (atomic_load_int(&tcp_do_tso) && > tp->t_inpcb->inp_options == NULL && > tp->t_inpcb->inp_outputopts6 == NULL && > #ifdef TCP_SIGNATURE > Index: netinet/tcp_timer.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_timer.c,v > diff -u -p -r1.76 tcp_timer.c > --- netinet/tcp_timer.c 28 Jan 2024 20:34:25 -0000 1.76 > +++ netinet/tcp_timer.c 5 Nov 2024 23:18:13 -0000 > @@ -167,10 +167,10 @@ tcp_canceltimers(struct tcpcb *tp) > TCP_TIMER_DISARM(tp, i); > } > > -int tcp_backoff[TCP_MAXRXTSHIFT + 1] = > +const int tcp_backoff[TCP_MAXRXTSHIFT + 1] = > { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 }; > > -int tcp_totbackoff = 511; /* sum of tcp_backoff[] */ > +const int tcp_totbackoff = 511; /* sum of tcp_backoff[] */ > > /* > * TCP timer processing. > Index: netinet/tcp_timer.h > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_timer.h,v > diff -u -p -r1.21 tcp_timer.h > --- netinet/tcp_timer.h 29 Jan 2024 22:47:13 -0000 1.21 > +++ netinet/tcp_timer.h 5 Nov 2024 23:18:13 -0000 > @@ -154,13 +154,12 @@ typedef void (*tcp_timer_func_t)(void *) > extern const tcp_timer_func_t tcp_timer_funcs[TCPT_NTIMERS]; > > extern int tcp_delack_msecs; /* delayed ACK timeout in millisecs */ > -extern int tcptv_keep_init; > extern int tcp_always_keepalive; /* assume SO_KEEPALIVE is always set */ > extern int tcp_keepidle; /* time before keepalive probes begin */ > extern int tcp_keepintvl; /* time between keepalive probes */ > extern int tcp_maxidle; /* time to drop after starting probes */ > extern int tcp_ttl; /* time to live for TCP segs */ > -extern int tcp_backoff[]; > +extern const int tcp_backoff[]; > > void tcp_timer_init(void); > #endif /* _KERNEL */ > Index: netinet/tcp_usrreq.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_usrreq.c,v > diff -u -p -r1.231 tcp_usrreq.c > --- netinet/tcp_usrreq.c 12 Apr 2024 16:07:09 -0000 1.231 > +++ netinet/tcp_usrreq.c 5 Nov 2024 23:18:13 -0000 > @@ -105,12 +105,12 @@ > #ifndef TCP_SENDSPACE > #define TCP_SENDSPACE 1024*16 > #endif > -u_int tcp_sendspace = TCP_SENDSPACE; > +u_int tcp_sendspace = TCP_SENDSPACE; /* [I] */ > #ifndef TCP_RECVSPACE > #define TCP_RECVSPACE 1024*16 > #endif > -u_int tcp_recvspace = TCP_RECVSPACE; > -u_int tcp_autorcvbuf_inc = 16 * 1024; > +u_int tcp_recvspace = TCP_RECVSPACE; /* [I] */ > +u_int tcp_autorcvbuf_inc = 16 * 1024; /* [I] */ > > const struct pr_usrreqs tcp_usrreqs = { > .pru_attach = tcp_attach, > @@ -1516,13 +1516,14 @@ tcp_update_sndspace(struct tcpcb *tp) > /* low on memory try to get rid of some */ > if (tcp_sendspace < nmax) > nmax = tcp_sendspace; > - } else if (so->so_snd.sb_wat != tcp_sendspace) > + } else if (so->so_snd.sb_wat != tcp_sendspace) { > /* user requested buffer size, auto-scaling disabled */ > nmax = so->so_snd.sb_wat; > - else > + } else { > /* automatic buffer scaling */ > nmax = MIN(sb_max, so->so_snd.sb_wat + tp->snd_max - > tp->snd_una); > + } > > /* a writable socket must be preserved because of poll(2) semantics */ > if (sbspace(so, &so->so_snd) >= so->so_snd.sb_lowat) { > @@ -1556,10 +1557,10 @@ tcp_update_rcvspace(struct tcpcb *tp) > /* low on memory try to get rid of some */ > if (tcp_recvspace < nmax) > nmax = tcp_recvspace; > - } else if (so->so_rcv.sb_wat != tcp_recvspace) > + } else if (so->so_rcv.sb_wat != tcp_recvspace) { > /* user requested buffer size, auto-scaling disabled */ > nmax = so->so_rcv.sb_wat; > - else { > + } else { > /* automatic buffer scaling */ > if (tp->rfbuf_cnt > so->so_rcv.sb_hiwat / 8 * 7) > nmax = MIN(sb_max, so->so_rcv.sb_hiwat + > Index: netinet/tcp_var.h > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_var.h,v > diff -u -p -r1.178 tcp_var.h > --- netinet/tcp_var.h 13 May 2024 01:15:53 -0000 1.178 > +++ netinet/tcp_var.h 5 Nov 2024 23:18:13 -0000 > @@ -228,6 +228,7 @@ struct tcp_opt_info { > /* > * Locks used to protect global data and struct members: > * I immutable after creation > + * a atomic operations > * N net lock > * S syn_cache_mtx tcp syn cache global mutex > */ > @@ -678,16 +679,17 @@ extern const struct pr_usrreqs tcp6_usrr > extern struct pool tcpcb_pool; > extern struct inpcbtable tcbtable, tcb6table; /* queue of active tcpcb's */ > extern int tcp_do_rfc1323; /* enabled/disabled? */ > +extern int tcprexmtthresh; /* [I] */ > extern int tcptv_keep_init; /* [N] time to keep alive initial SYN packet */ > -extern int tcp_mssdflt; /* default maximum segment size */ > +extern int tcp_mssdflt; /* [N] default maximum segment size */ > extern int tcp_rst_ppslim; /* maximum outgoing RST packet per second */ > extern int tcp_ack_on_push; /* ACK immediately on PUSH */ > extern int tcp_do_sack; /* SACK enabled/disabled */ > extern struct pool sackhl_pool; > extern int tcp_sackhole_limit; /* max entries for tcp sack queues */ > -extern int tcp_do_ecn; /* RFC3168 ECN enabled/disabled? */ > -extern int tcp_do_rfc3390; /* RFC3390 Increasing TCP's Initial Window */ > -extern int tcp_do_tso; /* enable TSO for TCP output packets */ > +extern int tcp_do_ecn; /* [N] RFC3168 ECN enabled/disabled? */ > +extern int tcp_do_rfc3390; /* [a] RFC3390 Increasing TCP Initial Window */ > +extern int tcp_do_tso; /* [a] enable TSO for TCP output packets */ > > extern struct pool tcpqe_pool; > extern int tcp_reass_limit; /* max entries for tcp reass queues */ > Index: sys/mbuf.h > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/sys/mbuf.h,v > diff -u -p -r1.265 mbuf.h > --- sys/mbuf.h 5 Nov 2024 13:15:13 -0000 1.265 > +++ sys/mbuf.h 5 Nov 2024 23:18:13 -0000 > @@ -411,7 +411,7 @@ struct mbuf_queue { > struct pool; > > extern long nmbclust; /* limit on the # of clusters */ > -extern int max_linkhdr; /* largest link-level header */ > +extern int max_linkhdr; /* [I] largest link-level header */ > extern int max_protohdr; /* largest protocol header */ > extern int max_hdr; /* largest link+protocol header */ > extern struct cpumem *mbstat; /* mbuf statistics counter */ > Index: sys/socketvar.h > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/sys/socketvar.h,v > diff -u -p -r1.134 socketvar.h > --- sys/socketvar.h 9 Sep 2024 07:38:45 -0000 1.134 > +++ sys/socketvar.h 5 Nov 2024 23:18:13 -0000 > @@ -376,7 +376,7 @@ sbassertlocked(struct sockbuf *sb) > } \ > } while (/*CONSTCOND*/0) > > -extern u_long sb_max; > +extern u_long sb_max; /* [I] */ > > extern struct pool socket_pool; > >