Download raw body.
tcp output MP fixes
On Wed, Nov 06, 2024 at 11:50:03AM +0100, Alexander Bluhm wrote:
> Hi,
>
> Next goal is running TCP socket layer with shared netlock. I went
> through tcp_output() and looked for MP races. Mostly documentation
> and small fixes.
>
> I am not sure about sbchecklowmem(). For MP safety we must serialize
> calls to m_pool_used() and protect sblowmem with a mutex. In
> practice not much can ever go wrong. I have chosen mtx_enter_try()
> to update without spinning and using a possibly obsololete value
> in case of lock contention.
>
> ok?
>
Could you keep locking description with variables definition? We could
have many places where variable declared as extern, so each of them
should have copy-pasted comment. Imagine you need to keep them in sync.
> bluhm
>
> Index: kern/uipc_socket2.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/kern/uipc_socket2.c,v
> diff -u -p -r1.158 uipc_socket2.c
> --- kern/uipc_socket2.c 12 Jul 2024 19:50:35 -0000 1.158
> +++ kern/uipc_socket2.c 6 Nov 2024 10:42:27 -0000
> @@ -49,6 +49,9 @@
>
> u_long sb_max = SB_MAX; /* patchable */
>
> +struct mutex sblowmem_mtx = MUTEX_INITIALIZER(IPL_SOFTNET);
> +int sblowmem;
> +
> extern struct pool mclpools[];
> extern struct pool mbpool;
>
> @@ -683,15 +674,24 @@ sbcheckreserve(u_long cnt, u_long defcnt
> int
> sbchecklowmem(void)
> {
> - static int sblowmem;
> - unsigned int used = m_pool_used();
> + int lowmem;
> + unsigned int used;
>
> + if (mtx_enter_try(&sblowmem_mtx) == 0) {
> + /* another thread is updating sblowmem, use old or new value */
> + return (sblowmem);
> + }
> +
> + used = m_pool_used();
> if (used < 60)
> sblowmem = 0;
> else if (used > 80)
> sblowmem = 1;
> + lowmem = sblowmem;
> +
> + mtx_leave(&sblowmem_mtx);
>
> - return (sblowmem);
> + return (lowmem);
> }
>
> /*
> Index: netinet/in_pcb.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.h,v
> diff -u -p -r1.159 in_pcb.h
> --- netinet/in_pcb.h 5 Nov 2024 10:49:23 -0000 1.159
> +++ netinet/in_pcb.h 5 Nov 2024 23:18:13 -0000
> @@ -140,7 +140,7 @@ struct inpcb {
> u_int16_t inp_lport; /* [t] local port */
> struct socket *inp_socket; /* [I] back pointer to socket */
> caddr_t inp_ppcb; /* pointer to per-protocol pcb */
> - struct route inp_route; /* cached route */
> + struct route inp_route; /* [s] cached route */
> struct refcnt inp_refcnt; /* refcount PCB, delay memory free */
> int inp_flags; /* generic IP/datagram flags */
> union { /* Header prototype. */
> Index: netinet/ip_input.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_input.c,v
> diff -u -p -r1.401 ip_input.c
> --- netinet/ip_input.c 6 Aug 2024 16:56:09 -0000 1.401
> +++ netinet/ip_input.c 5 Nov 2024 23:18:13 -0000
> @@ -83,23 +83,16 @@
> #include <netinet/ip_carp.h>
> #endif
>
> -/*
> - * Locks used to protect global variables in this file:
> - * I immutable after creation
> - * a atomic operations
> - * N net lock
> - */
> -
> /* values controllable via sysctl */
> -int ip_forwarding = 0; /* [a] */
> +int ip_forwarding = 0;
> int ipmforwarding = 0;
> int ipmultipath = 0;
> -int ip_sendredirects = 1; /* [a] */
> +int ip_sendredirects = 1;
> int ip_dosourceroute = 0;
> int ip_defttl = IPDEFTTL;
> int ip_mtudisc = 1;
> int ip_mtudisc_timeout = IPMTUDISCTIMEOUT;
> -int ip_directedbcast = 0; /* [a] */
> +int ip_directedbcast = 0;
>
> /* Protects `ipq' and `ip_frags'. */
> struct mutex ipq_mutex = MUTEX_INITIALIZER(IPL_SOFTNET);
> Index: netinet/ip_var.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_var.h,v
> diff -u -p -r1.120 ip_var.h
> --- netinet/ip_var.h 12 Jul 2024 19:50:35 -0000 1.120
> +++ netinet/ip_var.h 5 Nov 2024 23:18:13 -0000
> @@ -36,6 +36,13 @@
> #define _NETINET_IP_VAR_H_
>
> /*
> + * Locks used to protect global variables in this file:
> + * I immutable after creation
> + * a atomic operations
> + * N net lock
> + */
> +
> +/*
> * Structure stored in mbuf in inpcb.ip_options
> * and passed to ip_output when ip options are in use.
> * The actual length of the options (including ipopt_dst)
> @@ -216,19 +223,20 @@ extern int ip_defttl; /* default IP tt
>
> #define IPMTUDISCTIMEOUT (10 * 60) /* as per RFC 1191 */
>
> -extern int ip_mtudisc; /* mtu discovery */
> +extern int ip_mtudisc; /* [N] mtu discovery */
> extern int ip_mtudisc_timeout; /* seconds to timeout mtu discovery */
>
> extern int ipport_firstauto; /* min port for port allocation */
> extern int ipport_lastauto; /* max port for port allocation */
> extern int ipport_hifirstauto; /* min dynamic/private port number */
> extern int ipport_hilastauto; /* max dynamic/private port number */
> -extern int ip_forwarding; /* enable IP forwarding */
> +extern int ip_forwarding; /* [a] enable IP forwarding */
> #ifdef MROUTING
> extern int ipmforwarding; /* enable multicast forwarding */
> #endif
> extern int ipmultipath; /* enable multipath routing */
> -extern int ip_directedbcast; /* accept all broadcast packets */
> +extern int ip_sendredirects; /* [a] send icmp redirect while forwd */
> +extern int ip_directedbcast; /* [a] accept all broadcast packets */
> extern unsigned int la_hold_total;
>
> extern const struct pr_usrreqs rip_usrreqs;
> Index: netinet/tcp_fsm.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_fsm.h,v
> diff -u -p -r1.9 tcp_fsm.h
> --- netinet/tcp_fsm.h 5 Feb 2018 14:53:26 -0000 1.9
> +++ netinet/tcp_fsm.h 5 Nov 2024 23:18:13 -0000
> @@ -68,7 +68,7 @@
> * determined by state, with the proviso that TH_FIN is sent only
> * if all data queued for output is included in the segment.
> */
> -u_char tcp_outflags[TCP_NSTATES] = {
> +const u_char tcp_outflags[TCP_NSTATES] = {
> TH_RST|TH_ACK, 0, TH_SYN, TH_SYN|TH_ACK,
> TH_ACK, TH_ACK,
> TH_FIN|TH_ACK, TH_FIN|TH_ACK, TH_FIN|TH_ACK, TH_ACK, TH_ACK,
> @@ -76,7 +76,7 @@ u_char tcp_outflags[TCP_NSTATES] = {
> #endif /* TCPOUTFLAGS */
>
> #ifdef TCPSTATES
> -const char *tcpstates[] = {
> +const char *const tcpstates[] = {
> "CLOSED", "LISTEN", "SYN_SENT", "SYN_RCVD",
> "ESTABLISHED", "CLOSE_WAIT", "FIN_WAIT_1", "CLOSING",
> "LAST_ACK", "FIN_WAIT_2", "TIME_WAIT",
> Index: netinet/tcp_input.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_input.c,v
> diff -u -p -r1.407 tcp_input.c
> --- netinet/tcp_input.c 26 Aug 2024 13:55:14 -0000 1.407
> +++ netinet/tcp_input.c 5 Nov 2024 23:18:13 -0000
> @@ -2788,11 +2788,11 @@ tcp_xmit_timer(struct tcpcb *tp, int32_t
> int
> tcp_mss(struct tcpcb *tp, int offer)
> {
> + struct inpcb *inp;
> struct rtentry *rt;
> struct ifnet *ifp = NULL;
> - int mss, mssopt;
> - int iphlen;
> - struct inpcb *inp;
> + int mss, mssopt, iphlen, do_rfc3390;
> + u_int rtmtu;
>
> inp = tp->t_inpcb;
>
> @@ -2825,12 +2825,13 @@ tcp_mss(struct tcpcb *tp, int offer)
> * if there's an mtu associated with the route and we support
> * path MTU discovery for the underlying protocol family, use it.
> */
> - if (rt->rt_mtu) {
> + rtmtu = atomic_load_int(&rt->rt_mtu);
> + if (rtmtu) {
> /*
> * One may wish to lower MSS to take into account options,
> * especially security-related options.
> */
> - if (tp->pf == AF_INET6 && rt->rt_mtu < IPV6_MMTU) {
> + if (tp->pf == AF_INET6 && rtmtu < IPV6_MMTU) {
> /*
> * RFC2460 section 5, last paragraph: if path MTU is
> * smaller than 1280, use 1280 as packet size and
> @@ -2839,8 +2840,7 @@ tcp_mss(struct tcpcb *tp, int offer)
> mss = IPV6_MMTU - iphlen - sizeof(struct ip6_frag) -
> sizeof(struct tcphdr);
> } else {
> - mss = rt->rt_mtu - iphlen -
> - sizeof(struct tcphdr);
> + mss = rtmtu - iphlen - sizeof(struct tcphdr);
> }
> } else if (ifp->if_flags & IFF_LOOPBACK) {
> mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
> @@ -2900,6 +2900,7 @@ tcp_mss(struct tcpcb *tp, int offer)
> mss -= TCPOLEN_SIGLEN;
> #endif
>
> + do_rfc3390 = atomic_load_int(&tcp_do_rfc3390);
> if (offer == -1) {
> /* mss changed due to Path MTU discovery */
> tp->t_flags &= ~TF_PMTUD_PEND;
> @@ -2914,10 +2915,10 @@ tcp_mss(struct tcpcb *tp, int offer)
> tp->snd_cwnd = ulmax((tp->snd_cwnd / tp->t_maxseg) *
> mss, mss);
> }
> - } else if (tcp_do_rfc3390 == 2) {
> + } else if (do_rfc3390 == 2) {
> /* increase initial window */
> tp->snd_cwnd = ulmin(10 * mss, ulmax(2 * mss, 14600));
> - } else if (tcp_do_rfc3390) {
> + } else if (do_rfc3390) {
> /* increase initial window */
> tp->snd_cwnd = ulmin(4 * mss, ulmax(2 * mss, 4380));
> } else
> Index: netinet/tcp_output.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_output.c,v
> diff -u -p -r1.145 tcp_output.c
> --- netinet/tcp_output.c 14 May 2024 09:39:02 -0000 1.145
> +++ netinet/tcp_output.c 5 Nov 2024 23:18:13 -0000
> @@ -103,8 +103,6 @@
> extern struct mbuf *m_copypack();
> #endif
>
> -extern int tcprexmtthresh;
> -
> #ifdef TCP_SACK_DEBUG
> void tcp_print_holes(struct tcpcb *tp);
>
> @@ -350,7 +348,7 @@ again:
> txmaxseg = ulmin(so->so_snd.sb_hiwat / 2, tp->t_maxseg);
>
> if (len > txmaxseg) {
> - if (tcp_do_tso &&
> + if (atomic_load_int(&tcp_do_tso) &&
> tp->t_inpcb->inp_options == NULL &&
> tp->t_inpcb->inp_outputopts6 == NULL &&
> #ifdef TCP_SIGNATURE
> Index: netinet/tcp_timer.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_timer.c,v
> diff -u -p -r1.76 tcp_timer.c
> --- netinet/tcp_timer.c 28 Jan 2024 20:34:25 -0000 1.76
> +++ netinet/tcp_timer.c 5 Nov 2024 23:18:13 -0000
> @@ -167,10 +167,10 @@ tcp_canceltimers(struct tcpcb *tp)
> TCP_TIMER_DISARM(tp, i);
> }
>
> -int tcp_backoff[TCP_MAXRXTSHIFT + 1] =
> +const int tcp_backoff[TCP_MAXRXTSHIFT + 1] =
> { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };
>
> -int tcp_totbackoff = 511; /* sum of tcp_backoff[] */
> +const int tcp_totbackoff = 511; /* sum of tcp_backoff[] */
>
> /*
> * TCP timer processing.
> Index: netinet/tcp_timer.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_timer.h,v
> diff -u -p -r1.21 tcp_timer.h
> --- netinet/tcp_timer.h 29 Jan 2024 22:47:13 -0000 1.21
> +++ netinet/tcp_timer.h 5 Nov 2024 23:18:13 -0000
> @@ -154,13 +154,12 @@ typedef void (*tcp_timer_func_t)(void *)
> extern const tcp_timer_func_t tcp_timer_funcs[TCPT_NTIMERS];
>
> extern int tcp_delack_msecs; /* delayed ACK timeout in millisecs */
> -extern int tcptv_keep_init;
> extern int tcp_always_keepalive; /* assume SO_KEEPALIVE is always set */
> extern int tcp_keepidle; /* time before keepalive probes begin */
> extern int tcp_keepintvl; /* time between keepalive probes */
> extern int tcp_maxidle; /* time to drop after starting probes */
> extern int tcp_ttl; /* time to live for TCP segs */
> -extern int tcp_backoff[];
> +extern const int tcp_backoff[];
>
> void tcp_timer_init(void);
> #endif /* _KERNEL */
> Index: netinet/tcp_usrreq.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_usrreq.c,v
> diff -u -p -r1.231 tcp_usrreq.c
> --- netinet/tcp_usrreq.c 12 Apr 2024 16:07:09 -0000 1.231
> +++ netinet/tcp_usrreq.c 5 Nov 2024 23:18:13 -0000
> @@ -105,12 +105,12 @@
> #ifndef TCP_SENDSPACE
> #define TCP_SENDSPACE 1024*16
> #endif
> -u_int tcp_sendspace = TCP_SENDSPACE;
> +u_int tcp_sendspace = TCP_SENDSPACE; /* [I] */
> #ifndef TCP_RECVSPACE
> #define TCP_RECVSPACE 1024*16
> #endif
> -u_int tcp_recvspace = TCP_RECVSPACE;
> -u_int tcp_autorcvbuf_inc = 16 * 1024;
> +u_int tcp_recvspace = TCP_RECVSPACE; /* [I] */
> +u_int tcp_autorcvbuf_inc = 16 * 1024; /* [I] */
>
> const struct pr_usrreqs tcp_usrreqs = {
> .pru_attach = tcp_attach,
> @@ -1516,13 +1516,14 @@ tcp_update_sndspace(struct tcpcb *tp)
> /* low on memory try to get rid of some */
> if (tcp_sendspace < nmax)
> nmax = tcp_sendspace;
> - } else if (so->so_snd.sb_wat != tcp_sendspace)
> + } else if (so->so_snd.sb_wat != tcp_sendspace) {
> /* user requested buffer size, auto-scaling disabled */
> nmax = so->so_snd.sb_wat;
> - else
> + } else {
> /* automatic buffer scaling */
> nmax = MIN(sb_max, so->so_snd.sb_wat + tp->snd_max -
> tp->snd_una);
> + }
>
> /* a writable socket must be preserved because of poll(2) semantics */
> if (sbspace(so, &so->so_snd) >= so->so_snd.sb_lowat) {
> @@ -1556,10 +1557,10 @@ tcp_update_rcvspace(struct tcpcb *tp)
> /* low on memory try to get rid of some */
> if (tcp_recvspace < nmax)
> nmax = tcp_recvspace;
> - } else if (so->so_rcv.sb_wat != tcp_recvspace)
> + } else if (so->so_rcv.sb_wat != tcp_recvspace) {
> /* user requested buffer size, auto-scaling disabled */
> nmax = so->so_rcv.sb_wat;
> - else {
> + } else {
> /* automatic buffer scaling */
> if (tp->rfbuf_cnt > so->so_rcv.sb_hiwat / 8 * 7)
> nmax = MIN(sb_max, so->so_rcv.sb_hiwat +
> Index: netinet/tcp_var.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_var.h,v
> diff -u -p -r1.178 tcp_var.h
> --- netinet/tcp_var.h 13 May 2024 01:15:53 -0000 1.178
> +++ netinet/tcp_var.h 5 Nov 2024 23:18:13 -0000
> @@ -228,6 +228,7 @@ struct tcp_opt_info {
> /*
> * Locks used to protect global data and struct members:
> * I immutable after creation
> + * a atomic operations
> * N net lock
> * S syn_cache_mtx tcp syn cache global mutex
> */
> @@ -678,16 +679,17 @@ extern const struct pr_usrreqs tcp6_usrr
> extern struct pool tcpcb_pool;
> extern struct inpcbtable tcbtable, tcb6table; /* queue of active tcpcb's */
> extern int tcp_do_rfc1323; /* enabled/disabled? */
> +extern int tcprexmtthresh; /* [I] */
> extern int tcptv_keep_init; /* [N] time to keep alive initial SYN packet */
> -extern int tcp_mssdflt; /* default maximum segment size */
> +extern int tcp_mssdflt; /* [N] default maximum segment size */
> extern int tcp_rst_ppslim; /* maximum outgoing RST packet per second */
> extern int tcp_ack_on_push; /* ACK immediately on PUSH */
> extern int tcp_do_sack; /* SACK enabled/disabled */
> extern struct pool sackhl_pool;
> extern int tcp_sackhole_limit; /* max entries for tcp sack queues */
> -extern int tcp_do_ecn; /* RFC3168 ECN enabled/disabled? */
> -extern int tcp_do_rfc3390; /* RFC3390 Increasing TCP's Initial Window */
> -extern int tcp_do_tso; /* enable TSO for TCP output packets */
> +extern int tcp_do_ecn; /* [N] RFC3168 ECN enabled/disabled? */
> +extern int tcp_do_rfc3390; /* [a] RFC3390 Increasing TCP Initial Window */
> +extern int tcp_do_tso; /* [a] enable TSO for TCP output packets */
>
> extern struct pool tcpqe_pool;
> extern int tcp_reass_limit; /* max entries for tcp reass queues */
> Index: sys/mbuf.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/sys/mbuf.h,v
> diff -u -p -r1.265 mbuf.h
> --- sys/mbuf.h 5 Nov 2024 13:15:13 -0000 1.265
> +++ sys/mbuf.h 5 Nov 2024 23:18:13 -0000
> @@ -411,7 +411,7 @@ struct mbuf_queue {
> struct pool;
>
> extern long nmbclust; /* limit on the # of clusters */
> -extern int max_linkhdr; /* largest link-level header */
> +extern int max_linkhdr; /* [I] largest link-level header */
> extern int max_protohdr; /* largest protocol header */
> extern int max_hdr; /* largest link+protocol header */
> extern struct cpumem *mbstat; /* mbuf statistics counter */
> Index: sys/socketvar.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/sys/socketvar.h,v
> diff -u -p -r1.134 socketvar.h
> --- sys/socketvar.h 9 Sep 2024 07:38:45 -0000 1.134
> +++ sys/socketvar.h 5 Nov 2024 23:18:13 -0000
> @@ -376,7 +376,7 @@ sbassertlocked(struct sockbuf *sb)
> } \
> } while (/*CONSTCOND*/0)
>
> -extern u_long sb_max;
> +extern u_long sb_max; /* [I] */
>
> extern struct pool socket_pool;
>
>
tcp output MP fixes