From: Alexander Bluhm Subject: Re: tcp(4): use per-sockbuf mutex to protect `so_snd' socket buffer To: Vitaliy Makkoveev Cc: OpenBSD tech Date: Tue, 31 Dec 2024 12:55:15 +0100 On Mon, Dec 30, 2024 at 11:37:12PM +0300, Vitaliy Makkoveev wrote: > Updated against latest sources. Passed all my tests. > @@ -225,6 +225,10 @@ tcp_output(struct tcpcb *tp) > > now = tcp_now(); > > + mtx_enter(&so->so_snd.sb_mtx); > + doing_sosend=soissending(so); > + mtx_leave(&so->so_snd.sb_mtx); > + > /* > * Determine length of data that should be transmitted, > * and flags that will be used. Could you put spaces around = OK bluhm@ > Index: sys/kern/uipc_socket.c > =================================================================== > RCS file: /cvs/src/sys/kern/uipc_socket.c,v > diff -u -p -r1.351 uipc_socket.c > --- sys/kern/uipc_socket.c 30 Dec 2024 12:12:35 -0000 1.351 > +++ sys/kern/uipc_socket.c 30 Dec 2024 13:08:30 -0000 > @@ -151,27 +151,8 @@ soalloc(const struct protosw *prp, int w > TAILQ_INIT(&so->so_q0); > TAILQ_INIT(&so->so_q); > > - switch (dp->dom_family) { > - case AF_INET: > - case AF_INET6: > - switch (prp->pr_type) { > - case SOCK_RAW: > - case SOCK_DGRAM: > - so->so_snd.sb_flags |= SB_MTXLOCK; > - /* FALLTHROUGH */ > - case SOCK_STREAM: > - so->so_rcv.sb_flags |= SB_MTXLOCK; > - break; > - } > - break; > - case AF_KEY: > - case AF_ROUTE: > - case AF_UNIX: > - case AF_FRAME: > - so->so_snd.sb_flags |= SB_MTXLOCK; > - so->so_rcv.sb_flags |= SB_MTXLOCK; > - break; > - } > + so->so_snd.sb_flags |= SB_MTXLOCK; > + so->so_rcv.sb_flags |= SB_MTXLOCK; > > return (so); > } > @@ -271,9 +252,6 @@ solisten(struct socket *so, int backlog) > void > sorele(struct socket *so, int keep_lock) > { > - int need_lock = (((so->so_snd.sb_flags & SB_MTXLOCK) == 0) && > - keep_lock == 0); > - > if (keep_lock == 0) > sounlock(so); > > @@ -284,13 +262,9 @@ sorele(struct socket *so, int keep_lock) > klist_free(&so->so_rcv.sb_klist); > klist_free(&so->so_snd.sb_klist); > > - if (need_lock) > - solock(so); > mtx_enter(&so->so_snd.sb_mtx); > sbrelease(so, &so->so_snd); > mtx_leave(&so->so_snd.sb_mtx); > - if (need_lock) > - sounlock(so); > > if (so->so_proto->pr_flags & PR_RIGHTS && > so->so_proto->pr_domain->dom_dispose) > Index: sys/netinet/tcp_input.c > =================================================================== > RCS file: /cvs/src/sys/netinet/tcp_input.c,v > diff -u -p -r1.415 tcp_input.c > --- sys/netinet/tcp_input.c 30 Dec 2024 12:20:39 -0000 1.415 > +++ sys/netinet/tcp_input.c 30 Dec 2024 13:08:30 -0000 > @@ -957,7 +957,10 @@ findpcb: > acked); > tp->t_rcvacktime = now; > ND6_HINT(tp); > + > + mtx_enter(&so->so_snd.sb_mtx); > sbdrop(so, &so->so_snd, acked); > + mtx_leave(&so->so_snd.sb_mtx); > > /* > * If we had a pending ICMP message that > @@ -1738,10 +1741,14 @@ trimthenstep6: > tp->snd_wnd -= so->so_snd.sb_cc; > else > tp->snd_wnd = 0; > + mtx_enter(&so->so_snd.sb_mtx); > sbdrop(so, &so->so_snd, (int)so->so_snd.sb_cc); > + mtx_leave(&so->so_snd.sb_mtx); > ourfinisacked = 1; > } else { > + mtx_enter(&so->so_snd.sb_mtx); > sbdrop(so, &so->so_snd, acked); > + mtx_leave(&so->so_snd.sb_mtx); > if (tp->snd_wnd > acked) > tp->snd_wnd -= acked; > else > Index: sys/netinet/tcp_output.c > =================================================================== > RCS file: /cvs/src/sys/netinet/tcp_output.c,v > diff -u -p -r1.148 tcp_output.c > --- sys/netinet/tcp_output.c 28 Dec 2024 22:17:09 -0000 1.148 > +++ sys/netinet/tcp_output.c 30 Dec 2024 13:08:30 -0000 > @@ -200,7 +200,7 @@ tcp_output(struct tcpcb *tp) > u_int32_t optbuf[howmany(MAX_TCPOPTLEN, sizeof(u_int32_t))]; > u_char *opt = (u_char *)optbuf; > unsigned int optlen, hdrlen, packetlen; > - int idle, sendalot = 0; > + int doing_sosend, idle, sendalot = 0; > int i, sack_rxmit = 0; > struct sackhole *p; > uint64_t now; > @@ -225,6 +225,10 @@ tcp_output(struct tcpcb *tp) > > now = tcp_now(); > > + mtx_enter(&so->so_snd.sb_mtx); > + doing_sosend=soissending(so); > + mtx_leave(&so->so_snd.sb_mtx); > + > /* > * Determine length of data that should be transmitted, > * and flags that will be used. > @@ -241,7 +245,7 @@ tcp_output(struct tcpcb *tp) > tp->snd_cwnd = 2 * tp->t_maxseg; > > /* remember 'idle' for next invocation of tcp_output */ > - if (idle && soissending(so)) { > + if (idle && doing_sosend) { > tp->t_flags |= TF_LASTIDLE; > idle = 0; > } else > @@ -390,7 +394,7 @@ again: > if (len >= txmaxseg) > goto send; > if ((idle || (tp->t_flags & TF_NODELAY)) && > - len + off >= so->so_snd.sb_cc && !soissending(so) && > + len + off >= so->so_snd.sb_cc && !doing_sosend && > (tp->t_flags & TF_NOPUSH) == 0) > goto send; > if (tp->t_force) > @@ -723,7 +727,7 @@ send: > * give data to the user when a buffer fills or > * a PUSH comes in.) > */ > - if (off + len == so->so_snd.sb_cc && !soissending(so)) > + if (off + len == so->so_snd.sb_cc && !doing_sosend) > flags |= TH_PUSH; > tp->t_sndtime = now; > } else { > Index: sys/netinet/tcp_usrreq.c > =================================================================== > RCS file: /cvs/src/sys/netinet/tcp_usrreq.c,v > diff -u -p -r1.234 tcp_usrreq.c > --- sys/netinet/tcp_usrreq.c 30 Dec 2024 12:20:39 -0000 1.234 > +++ sys/netinet/tcp_usrreq.c 30 Dec 2024 13:08:30 -0000 > @@ -302,10 +302,12 @@ tcp_fill_info(struct tcpcb *tp, struct s > ti->tcpi_so_rcv_sb_lowat = so->so_rcv.sb_lowat; > ti->tcpi_so_rcv_sb_wat = so->so_rcv.sb_wat; > mtx_leave(&so->so_rcv.sb_mtx); > + mtx_enter(&so->so_snd.sb_mtx); > ti->tcpi_so_snd_sb_cc = so->so_snd.sb_cc; > ti->tcpi_so_snd_sb_hiwat = so->so_snd.sb_hiwat; > ti->tcpi_so_snd_sb_lowat = so->so_snd.sb_lowat; > ti->tcpi_so_snd_sb_wat = so->so_snd.sb_wat; > + mtx_leave(&so->so_snd.sb_mtx); > > return 0; > } > @@ -842,7 +844,9 @@ tcp_send(struct socket *so, struct mbuf > if (so->so_options & SO_DEBUG) > ostate = tp->t_state; > > + mtx_enter(&so->so_snd.sb_mtx); > sbappendstream(so, &so->so_snd, m); > + mtx_leave(&so->so_snd.sb_mtx); > m = NULL; > > error = tcp_output(tp); > @@ -895,7 +899,9 @@ tcp_sense(struct socket *so, struct stat > if ((error = tcp_sogetpcb(so, &inp, &tp))) > return (error); > > + mtx_enter(&so->so_snd.sb_mtx); > ub->st_blksize = so->so_snd.sb_hiwat; > + mtx_leave(&so->so_snd.sb_mtx); > > if (so->so_options & SO_DEBUG) > tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_SENSE, 0); > @@ -970,7 +976,9 @@ tcp_sendoob(struct socket *so, struct mb > * of data past the urgent section. > * Otherwise, snd_up should be one lower. > */ > + mtx_enter(&so->so_snd.sb_mtx); > sbappendstream(so, &so->so_snd, m); > + mtx_leave(&so->so_snd.sb_mtx); > m = NULL; > tp->snd_up = tp->snd_una + so->so_snd.sb_cc; > tp->t_force = 1; > @@ -1517,7 +1525,11 @@ void > tcp_update_sndspace(struct tcpcb *tp) > { > struct socket *so = tp->t_inpcb->inp_socket; > - u_long nmax = so->so_snd.sb_hiwat; > + u_long nmax; > + > + mtx_enter(&so->so_snd.sb_mtx); > + > + nmax = so->so_snd.sb_hiwat; > > if (sbchecklowmem()) { > /* low on memory try to get rid of some */ > @@ -1533,7 +1545,7 @@ tcp_update_sndspace(struct tcpcb *tp) > } > > /* a writable socket must be preserved because of poll(2) semantics */ > - if (sbspace(so, &so->so_snd) >= so->so_snd.sb_lowat) { > + if (sbspace_locked(so, &so->so_snd) >= so->so_snd.sb_lowat) { > if (nmax < so->so_snd.sb_cc + so->so_snd.sb_lowat) > nmax = so->so_snd.sb_cc + so->so_snd.sb_lowat; > /* keep in sync with sbreserve() calculation */ > @@ -1546,6 +1558,8 @@ tcp_update_sndspace(struct tcpcb *tp) > > if (nmax != so->so_snd.sb_hiwat) > sbreserve(so, &so->so_snd, nmax); > + > + mtx_leave(&so->so_snd.sb_mtx); > } > > /* > @@ -1579,9 +1593,11 @@ tcp_update_rcvspace(struct tcpcb *tp) > } > > /* a readable socket must be preserved because of poll(2) semantics */ > + mtx_enter(&so->so_snd.sb_mtx); > if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat && > nmax < so->so_snd.sb_lowat) > nmax = so->so_snd.sb_lowat; > + mtx_leave(&so->so_snd.sb_mtx); > > if (nmax != so->so_rcv.sb_hiwat) { > /* round to MSS boundary */