From: Vitaliy Makkoveev Subject: Re: tcp(4): use per-sockbuf mutex to protect `so_snd' socket buffer To: Alexander Bluhm Cc: OpenBSD tech Date: Fri, 27 Dec 2024 14:44:46 +0300 On Thu, Dec 26, 2024 at 06:43:29PM +0300, Vitaliy Makkoveev wrote: > > On 26 Dec 2024, at 15:47, Alexander Bluhm wrote: > > > > On Tue, Dec 24, 2024 at 10:21:55PM +0300, Vitaliy Makkoveev wrote: > >> This is the updated diff. It uses barriers instead of `ssp_idleto' > >> timeout and `ssp_task' task redefinition. > > > > I have tested this diff with my regression, performance, and network > > setup. Works fine. > > > > OK bluhm@ > > > > Could you wait one day before commiting this? Then we have a > > snapshot between my and your unlocking diff. If something goes > > wrong, it will be easier to bisect. > > Thanks for testing! Will commit this weekend or Monday. > The updated diff. It doesn't contain already committed tcp_mss_update() and unsplicing hunks. Note, in sofree() I intentionally left "so->so_sp" check within "if (!keep_lock)" block. 
Index: sys/kern/uipc_socket.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_socket.c,v diff -u -p -r1.349 uipc_socket.c --- sys/kern/uipc_socket.c 27 Dec 2024 10:18:04 -0000 1.349 +++ sys/kern/uipc_socket.c 27 Dec 2024 11:36:04 -0000 @@ -151,27 +151,8 @@ soalloc(const struct protosw *prp, int w TAILQ_INIT(&so->so_q0); TAILQ_INIT(&so->so_q); - switch (dp->dom_family) { - case AF_INET: - case AF_INET6: - switch (prp->pr_type) { - case SOCK_RAW: - case SOCK_DGRAM: - so->so_snd.sb_flags |= SB_MTXLOCK; - /* FALLTHROUGH */ - case SOCK_STREAM: - so->so_rcv.sb_flags |= SB_MTXLOCK; - break; - } - break; - case AF_KEY: - case AF_ROUTE: - case AF_UNIX: - case AF_FRAME: - so->so_snd.sb_flags |= SB_MTXLOCK; - so->so_rcv.sb_flags |= SB_MTXLOCK; - break; - } + so->so_snd.sb_flags |= SB_MTXLOCK; + so->so_rcv.sb_flags |= SB_MTXLOCK; return (so); } @@ -332,7 +313,6 @@ sofree(struct socket *so, int keep_lock) */ sounlock(so); refcnt_finalize(&so->so_refcnt, "sofinal"); - solock(so); } sigio_free(&so->so_sigio); @@ -354,8 +334,6 @@ sofree(struct socket *so, int keep_lock) m_purge(so->so_rcv.sb_mb); if (!keep_lock) { - sounlock(so); - #ifdef SOCKET_SPLICE if (so->so_sp) { timeout_del_barrier(&so->so_sp->ssp_idleto); Index: sys/netinet/tcp_input.c =================================================================== RCS file: /cvs/src/sys/netinet/tcp_input.c,v diff -u -p -r1.413 tcp_input.c --- sys/netinet/tcp_input.c 26 Dec 2024 12:16:17 -0000 1.413 +++ sys/netinet/tcp_input.c 27 Dec 2024 11:36:04 -0000 @@ -957,7 +957,10 @@ findpcb: acked); tp->t_rcvacktime = now; ND6_HINT(tp); + + mtx_enter(&so->so_snd.sb_mtx); sbdrop(so, &so->so_snd, acked); + mtx_leave(&so->so_snd.sb_mtx); /* * If we had a pending ICMP message that @@ -1738,10 +1741,14 @@ trimthenstep6: tp->snd_wnd -= so->so_snd.sb_cc; else tp->snd_wnd = 0; + mtx_enter(&so->so_snd.sb_mtx); sbdrop(so, &so->so_snd, (int)so->so_snd.sb_cc); + mtx_leave(&so->so_snd.sb_mtx); 
ourfinisacked = 1; } else { + mtx_enter(&so->so_snd.sb_mtx); sbdrop(so, &so->so_snd, acked); + mtx_leave(&so->so_snd.sb_mtx); if (tp->snd_wnd > acked) tp->snd_wnd -= acked; else Index: sys/netinet/tcp_output.c =================================================================== RCS file: /cvs/src/sys/netinet/tcp_output.c,v diff -u -p -r1.147 tcp_output.c --- sys/netinet/tcp_output.c 26 Dec 2024 12:16:17 -0000 1.147 +++ sys/netinet/tcp_output.c 27 Dec 2024 11:36:04 -0000 @@ -200,7 +200,7 @@ tcp_output(struct tcpcb *tp) u_int32_t optbuf[howmany(MAX_TCPOPTLEN, sizeof(u_int32_t))]; u_char *opt = (u_char *)optbuf; unsigned int optlen, hdrlen, packetlen; - int idle, sendalot = 0; + int doing_sosend, idle, sendalot = 0; int i, sack_rxmit = 0; struct sackhole *p; uint64_t now; @@ -225,6 +225,10 @@ tcp_output(struct tcpcb *tp) now = tcp_now(); + mtx_enter(&so->so_snd.sb_mtx); + doing_sosend = soissending(so); + mtx_leave(&so->so_snd.sb_mtx); + /* * Determine length of data that should be transmitted, * and flags that will be used. @@ -241,7 +245,7 @@ tcp_output(struct tcpcb *tp) tp->snd_cwnd = 2 * tp->t_maxseg; /* remember 'idle' for next invocation of tcp_output */ - if (idle && soissending(so)) { + if (idle && doing_sosend) { tp->t_flags |= TF_LASTIDLE; idle = 0; } else @@ -390,7 +394,7 @@ again: if (len >= txmaxseg) goto send; if ((idle || (tp->t_flags & TF_NODELAY)) && - len + off >= so->so_snd.sb_cc && !soissending(so) && + len + off >= so->so_snd.sb_cc && !doing_sosend && (tp->t_flags & TF_NOPUSH) == 0) goto send; if (tp->t_force) @@ -723,7 +727,7 @@ send: * give data to the user when a buffer fills or * a PUSH comes in.) 
*/ - if (off + len == so->so_snd.sb_cc && !soissending(so)) + if (off + len == so->so_snd.sb_cc && !doing_sosend) flags |= TH_PUSH; tp->t_sndtime = now; } else { Index: sys/netinet/tcp_usrreq.c =================================================================== RCS file: /cvs/src/sys/netinet/tcp_usrreq.c,v diff -u -p -r1.233 tcp_usrreq.c --- sys/netinet/tcp_usrreq.c 19 Dec 2024 22:11:35 -0000 1.233 +++ sys/netinet/tcp_usrreq.c 27 Dec 2024 11:36:04 -0000 @@ -302,10 +302,12 @@ tcp_fill_info(struct tcpcb *tp, struct s ti->tcpi_so_rcv_sb_lowat = so->so_rcv.sb_lowat; ti->tcpi_so_rcv_sb_wat = so->so_rcv.sb_wat; mtx_leave(&so->so_rcv.sb_mtx); + mtx_enter(&so->so_snd.sb_mtx); ti->tcpi_so_snd_sb_cc = so->so_snd.sb_cc; ti->tcpi_so_snd_sb_hiwat = so->so_snd.sb_hiwat; ti->tcpi_so_snd_sb_lowat = so->so_snd.sb_lowat; ti->tcpi_so_snd_sb_wat = so->so_snd.sb_wat; + mtx_leave(&so->so_snd.sb_mtx); return 0; } @@ -842,7 +844,9 @@ tcp_send(struct socket *so, struct mbuf if (so->so_options & SO_DEBUG) ostate = tp->t_state; + mtx_enter(&so->so_snd.sb_mtx); sbappendstream(so, &so->so_snd, m); + mtx_leave(&so->so_snd.sb_mtx); m = NULL; error = tcp_output(tp); @@ -895,7 +899,9 @@ tcp_sense(struct socket *so, struct stat if ((error = tcp_sogetpcb(so, &inp, &tp))) return (error); + mtx_enter(&so->so_snd.sb_mtx); ub->st_blksize = so->so_snd.sb_hiwat; + mtx_leave(&so->so_snd.sb_mtx); if (so->so_options & SO_DEBUG) tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_SENSE, 0); @@ -970,7 +976,9 @@ tcp_sendoob(struct socket *so, struct mb * of data past the urgent section. * Otherwise, snd_up should be one lower. 
*/ + mtx_enter(&so->so_snd.sb_mtx); sbappendstream(so, &so->so_snd, m); + mtx_leave(&so->so_snd.sb_mtx); m = NULL; tp->snd_up = tp->snd_una + so->so_snd.sb_cc; tp->t_force = 1; @@ -1519,7 +1527,11 @@ void tcp_update_sndspace(struct tcpcb *tp) { struct socket *so = tp->t_inpcb->inp_socket; - u_long nmax = so->so_snd.sb_hiwat; + u_long nmax; + + mtx_enter(&so->so_snd.sb_mtx); + + nmax = so->so_snd.sb_hiwat; if (sbchecklowmem()) { /* low on memory try to get rid of some */ @@ -1535,7 +1547,7 @@ tcp_update_sndspace(struct tcpcb *tp) } /* a writable socket must be preserved because of poll(2) semantics */ - if (sbspace(so, &so->so_snd) >= so->so_snd.sb_lowat) { + if (sbspace_locked(so, &so->so_snd) >= so->so_snd.sb_lowat) { if (nmax < so->so_snd.sb_cc + so->so_snd.sb_lowat) nmax = so->so_snd.sb_cc + so->so_snd.sb_lowat; /* keep in sync with sbreserve() calculation */ @@ -1548,6 +1560,8 @@ tcp_update_sndspace(struct tcpcb *tp) if (nmax != so->so_snd.sb_hiwat) sbreserve(so, &so->so_snd, nmax); + + mtx_leave(&so->so_snd.sb_mtx); } /* @@ -1581,9 +1595,11 @@ tcp_update_rcvspace(struct tcpcb *tp) } /* a readable socket must be preserved because of poll(2) semantics */ + mtx_enter(&so->so_snd.sb_mtx); if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat && nmax < so->so_snd.sb_lowat) nmax = so->so_snd.sb_lowat; + mtx_leave(&so->so_snd.sb_mtx); if (nmax != so->so_rcv.sb_hiwat) { /* round to MSS boundary */