Download raw body.
send socket buffer mutex in tcp_mss_update()
> On 24 Dec 2024, at 20:31, Alexander Bluhm <alexander.bluhm@gmx.net> wrote:
>
> On Tue, Dec 24, 2024 at 07:33:47PM +0300, Vitaliy Makkoveev wrote:
>> On Tue, Dec 24, 2024 at 02:43:37PM +0100, Alexander Bluhm wrote:
>>> Hi,
>>>
>>> I found this fix in my larger TCP output unlocking diff. Vitaliy,
>>> you address the same issue in your larger socket buffer mutex diff.
>>> I think it is better release the mutex before calling tcp_mss().
>>> There may be another route lookup, and I want to keep that without
>>> needless mutex.
>>>
>>> ok?
>>
>> You are right, lock is useless for the "bufsize < mss" branch. However,
>> isn't it better to update and commit my `so_snd' unlocking diff instead
>> of the only hunk?
>
> I come from the other direction. My goal is to set PR_MPSOCKET for
> TCP. And the diff I sent is the final chunk that is missing to
> make parallel tcp_output() work.
>
> As I see panics with the diff below, I would like to get the
> tcp_mss_update() chunk in independently. This makes it easier to
> test my TCP PR_MPSOCKET diff and your TCP SB_MTXLOCK diff. My diff
> is stable, yours not. Maybe we should review and commit the diff
> below in smaller chunks to understand easier why my test machine
> panics.
>
> bluhm
>
Sure, commit this diff as is.
>> Index: sys/kern/uipc_socket.c
>> ===================================================================
>> RCS file: /cvs/src/sys/kern/uipc_socket.c,v
>> diff -u -p -r1.347 uipc_socket.c
>> --- sys/kern/uipc_socket.c 19 Dec 2024 22:11:35 -0000 1.347
>> +++ sys/kern/uipc_socket.c 24 Dec 2024 16:22:36 -0000
>> @@ -153,27 +153,8 @@ soalloc(const struct protosw *prp, int w
>> TAILQ_INIT(&so->so_q0);
>> TAILQ_INIT(&so->so_q);
>>
>> - switch (dp->dom_family) {
>> - case AF_INET:
>> - case AF_INET6:
>> - switch (prp->pr_type) {
>> - case SOCK_RAW:
>> - case SOCK_DGRAM:
>> - so->so_snd.sb_flags |= SB_MTXLOCK;
>> - /* FALLTHROUGH */
>> - case SOCK_STREAM:
>> - so->so_rcv.sb_flags |= SB_MTXLOCK;
>> - break;
>> - }
>> - break;
>> - case AF_KEY:
>> - case AF_ROUTE:
>> - case AF_UNIX:
>> - case AF_FRAME:
>> - so->so_snd.sb_flags |= SB_MTXLOCK;
>> - so->so_rcv.sb_flags |= SB_MTXLOCK;
>> - break;
>> - }
>> + so->so_snd.sb_flags |= SB_MTXLOCK;
>> + so->so_rcv.sb_flags |= SB_MTXLOCK;
>>
>> return (so);
>> }
>> @@ -327,17 +308,13 @@ sofree(struct socket *so, int keep_lock)
>> sounlock(head);
>> }
>>
>> - switch (so->so_proto->pr_domain->dom_family) {
>> - case AF_INET:
>> - case AF_INET6:
>> - if (so->so_proto->pr_type == SOCK_STREAM)
>> - break;
>> - /* FALLTHROUGH */
>> - default:
>> + if (!keep_lock) {
>> + /*
>> + * sofree() was called from soclose(). Sleep is safe
>> + * even for tcp(4) sockets.
>> + */
>> sounlock(so);
>> refcnt_finalize(&so->so_refcnt, "sofinal");
>> - solock(so);
>> - break;
>> }
>>
>> sigio_free(&so->so_sigio);
>> @@ -358,9 +335,6 @@ sofree(struct socket *so, int keep_lock)
>> (*so->so_proto->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
>> m_purge(so->so_rcv.sb_mb);
>>
>> - if (!keep_lock)
>> - sounlock(so);
>> -
>> #ifdef SOCKET_SPLICE
>> if (so->so_sp) {
>> /* Reuse splice idle, sounsplice() has been called before. */
>> @@ -458,31 +432,6 @@ discard:
>> if (so->so_sp) {
>> struct socket *soback;
>>
>> - if (so->so_proto->pr_flags & PR_WANTRCVD) {
>> - /*
>> - * Copy - Paste, but can't relock and sleep in
>> - * sofree() in tcp(4) case. That's why tcp(4)
>> - * still rely on solock() for splicing and
>> - * unsplicing.
>> - */
>> -
>> - if (issplicedback(so)) {
>> - int freeing = SOSP_FREEING_WRITE;
>> -
>> - if (so->so_sp->ssp_soback == so)
>> - freeing |= SOSP_FREEING_READ;
>> - sounsplice(so->so_sp->ssp_soback, so, freeing);
>> - }
>> - if (isspliced(so)) {
>> - int freeing = SOSP_FREEING_READ;
>> -
>> - if (so == so->so_sp->ssp_socket)
>> - freeing |= SOSP_FREEING_WRITE;
>> - sounsplice(so, so->so_sp->ssp_socket, freeing);
>> - }
>> - goto free;
>> - }
>> -
>> sounlock(so);
>> mtx_enter(&so->so_snd.sb_mtx);
>> /*
>> @@ -530,7 +479,6 @@ notsplicedback:
>>
>> solock(so);
>> }
>> -free:
>> #endif /* SOCKET_SPLICE */
>> /* sofree() calls sounlock(). */
>> sofree(so, 0);
>> @@ -1587,17 +1535,22 @@ sotask(void *arg)
>> */
>>
>> sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
>> - if (sockstream)
>> - solock(so);
>> -
>> if (so->so_rcv.sb_flags & SB_SPLICE) {
>> - if (sockstream)
>> + struct socket *sosp = so->so_sp->ssp_socket;
>> +
>> + if (sockstream) {
>> + sblock(&sosp->so_snd, SBL_WAIT | SBL_NOINTR);
>> + solock(so);
>> doyield = 1;
>> + }
>> +
>> somove(so, M_DONTWAIT);
>> - }
>>
>> - if (sockstream)
>> - sounlock(so);
>> + if (sockstream) {
>> + sounlock(so);
>> + sbunlock(&sosp->so_snd);
>> + }
>> + }
>> sbunlock(&so->so_rcv);
>>
>> if (doyield) {
>> Index: sys/netinet/tcp_input.c
>> ===================================================================
>> RCS file: /cvs/src/sys/netinet/tcp_input.c,v
>> diff -u -p -r1.411 tcp_input.c
>> --- sys/netinet/tcp_input.c 24 Dec 2024 12:22:17 -0000 1.411
>> +++ sys/netinet/tcp_input.c 24 Dec 2024 16:22:36 -0000
>> @@ -957,7 +957,10 @@ findpcb:
>> acked);
>> tp->t_rcvacktime = now;
>> ND6_HINT(tp);
>> +
>> + mtx_enter(&so->so_snd.sb_mtx);
>> sbdrop(so, &so->so_snd, acked);
>> + mtx_leave(&so->so_snd.sb_mtx);
>>
>> /*
>> * If we had a pending ICMP message that
>> @@ -1738,10 +1741,14 @@ trimthenstep6:
>> tp->snd_wnd -= so->so_snd.sb_cc;
>> else
>> tp->snd_wnd = 0;
>> + mtx_enter(&so->so_snd.sb_mtx);
>> sbdrop(so, &so->so_snd, (int)so->so_snd.sb_cc);
>> + mtx_leave(&so->so_snd.sb_mtx);
>> ourfinisacked = 1;
>> } else {
>> + mtx_enter(&so->so_snd.sb_mtx);
>> sbdrop(so, &so->so_snd, acked);
>> + mtx_leave(&so->so_snd.sb_mtx);
>> if (tp->snd_wnd > acked)
>> tp->snd_wnd -= acked;
>> else
>> @@ -2995,8 +3002,10 @@ tcp_mss_update(struct tcpcb *tp)
>> if (rt == NULL)
>> return;
>>
>> + mtx_enter(&so->so_snd.sb_mtx);
>> bufsize = so->so_snd.sb_hiwat;
>> if (bufsize < mss) {
>> + mtx_leave(&so->so_snd.sb_mtx);
>> mss = bufsize;
>> /* Update t_maxseg and t_maxopd */
>> tcp_mss(tp, mss);
>> @@ -3005,6 +3014,7 @@ tcp_mss_update(struct tcpcb *tp)
>> if (bufsize > sb_max)
>> bufsize = sb_max;
>> (void)sbreserve(so, &so->so_snd, bufsize);
>> + mtx_leave(&so->so_snd.sb_mtx);
>> }
>>
>> mtx_enter(&so->so_rcv.sb_mtx);
>> Index: sys/netinet/tcp_output.c
>> ===================================================================
>> RCS file: /cvs/src/sys/netinet/tcp_output.c,v
>> diff -u -p -r1.146 tcp_output.c
>> --- sys/netinet/tcp_output.c 19 Dec 2024 22:11:35 -0000 1.146
>> +++ sys/netinet/tcp_output.c 24 Dec 2024 16:22:36 -0000
>> @@ -202,7 +202,7 @@ tcp_output(struct tcpcb *tp)
>> u_int32_t optbuf[howmany(MAX_TCPOPTLEN, sizeof(u_int32_t))];
>> u_char *opt = (u_char *)optbuf;
>> unsigned int optlen, hdrlen, packetlen;
>> - int idle, sendalot = 0;
>> + int doing_sosend, idle, sendalot = 0;
>> int i, sack_rxmit = 0;
>> struct sackhole *p;
>> uint64_t now;
>> @@ -227,6 +227,10 @@ tcp_output(struct tcpcb *tp)
>>
>> now = tcp_now();
>>
>> + mtx_enter(&so->so_snd.sb_mtx);
>> + doing_sosend=soissending(so);
>> + mtx_leave(&so->so_snd.sb_mtx);
>> +
>> /*
>> * Determine length of data that should be transmitted,
>> * and flags that will be used.
>> @@ -243,7 +247,7 @@ tcp_output(struct tcpcb *tp)
>> tp->snd_cwnd = 2 * tp->t_maxseg;
>>
>> /* remember 'idle' for next invocation of tcp_output */
>> - if (idle && soissending(so)) {
>> + if (idle && doing_sosend) {
>> tp->t_flags |= TF_LASTIDLE;
>> idle = 0;
>> } else
>> @@ -392,7 +396,7 @@ again:
>> if (len >= txmaxseg)
>> goto send;
>> if ((idle || (tp->t_flags & TF_NODELAY)) &&
>> - len + off >= so->so_snd.sb_cc && !soissending(so) &&
>> + len + off >= so->so_snd.sb_cc && !doing_sosend &&
>> (tp->t_flags & TF_NOPUSH) == 0)
>> goto send;
>> if (tp->t_force)
>> @@ -725,7 +729,7 @@ send:
>> * give data to the user when a buffer fills or
>> * a PUSH comes in.)
>> */
>> - if (off + len == so->so_snd.sb_cc && !soissending(so))
>> + if (off + len == so->so_snd.sb_cc && !doing_sosend)
>> flags |= TH_PUSH;
>> tp->t_sndtime = now;
>> } else {
>> Index: sys/netinet/tcp_usrreq.c
>> ===================================================================
>> RCS file: /cvs/src/sys/netinet/tcp_usrreq.c,v
>> diff -u -p -r1.233 tcp_usrreq.c
>> --- sys/netinet/tcp_usrreq.c 19 Dec 2024 22:11:35 -0000 1.233
>> +++ sys/netinet/tcp_usrreq.c 24 Dec 2024 16:22:36 -0000
>> @@ -302,10 +302,12 @@ tcp_fill_info(struct tcpcb *tp, struct s
>> ti->tcpi_so_rcv_sb_lowat = so->so_rcv.sb_lowat;
>> ti->tcpi_so_rcv_sb_wat = so->so_rcv.sb_wat;
>> mtx_leave(&so->so_rcv.sb_mtx);
>> + mtx_enter(&so->so_snd.sb_mtx);
>> ti->tcpi_so_snd_sb_cc = so->so_snd.sb_cc;
>> ti->tcpi_so_snd_sb_hiwat = so->so_snd.sb_hiwat;
>> ti->tcpi_so_snd_sb_lowat = so->so_snd.sb_lowat;
>> ti->tcpi_so_snd_sb_wat = so->so_snd.sb_wat;
>> + mtx_leave(&so->so_snd.sb_mtx);
>>
>> return 0;
>> }
>> @@ -842,7 +844,9 @@ tcp_send(struct socket *so, struct mbuf
>> if (so->so_options & SO_DEBUG)
>> ostate = tp->t_state;
>>
>> + mtx_enter(&so->so_snd.sb_mtx);
>> sbappendstream(so, &so->so_snd, m);
>> + mtx_leave(&so->so_snd.sb_mtx);
>> m = NULL;
>>
>> error = tcp_output(tp);
>> @@ -895,7 +899,9 @@ tcp_sense(struct socket *so, struct stat
>> if ((error = tcp_sogetpcb(so, &inp, &tp)))
>> return (error);
>>
>> + mtx_enter(&so->so_snd.sb_mtx);
>> ub->st_blksize = so->so_snd.sb_hiwat;
>> + mtx_leave(&so->so_snd.sb_mtx);
>>
>> if (so->so_options & SO_DEBUG)
>> tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_SENSE, 0);
>> @@ -970,7 +976,9 @@ tcp_sendoob(struct socket *so, struct mb
>> * of data past the urgent section.
>> * Otherwise, snd_up should be one lower.
>> */
>> + mtx_enter(&so->so_snd.sb_mtx);
>> sbappendstream(so, &so->so_snd, m);
>> + mtx_leave(&so->so_snd.sb_mtx);
>> m = NULL;
>> tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
>> tp->t_force = 1;
>> @@ -1519,7 +1527,11 @@ void
>> tcp_update_sndspace(struct tcpcb *tp)
>> {
>> struct socket *so = tp->t_inpcb->inp_socket;
>> - u_long nmax = so->so_snd.sb_hiwat;
>> + u_long nmax;
>> +
>> + mtx_enter(&so->so_snd.sb_mtx);
>> +
>> + nmax = so->so_snd.sb_hiwat;
>>
>> if (sbchecklowmem()) {
>> /* low on memory try to get rid of some */
>> @@ -1535,7 +1547,7 @@ tcp_update_sndspace(struct tcpcb *tp)
>> }
>>
>> /* a writable socket must be preserved because of poll(2) semantics */
>> - if (sbspace(so, &so->so_snd) >= so->so_snd.sb_lowat) {
>> + if (sbspace_locked(so, &so->so_snd) >= so->so_snd.sb_lowat) {
>> if (nmax < so->so_snd.sb_cc + so->so_snd.sb_lowat)
>> nmax = so->so_snd.sb_cc + so->so_snd.sb_lowat;
>> /* keep in sync with sbreserve() calculation */
>> @@ -1548,6 +1560,8 @@ tcp_update_sndspace(struct tcpcb *tp)
>>
>> if (nmax != so->so_snd.sb_hiwat)
>> sbreserve(so, &so->so_snd, nmax);
>> +
>> + mtx_leave(&so->so_snd.sb_mtx);
>> }
>>
>> /*
>> @@ -1581,9 +1595,11 @@ tcp_update_rcvspace(struct tcpcb *tp)
>> }
>>
>> /* a readable socket must be preserved because of poll(2) semantics */
>> + mtx_enter(&so->so_snd.sb_mtx);
>> if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat &&
>> nmax < so->so_snd.sb_lowat)
>> nmax = so->so_snd.sb_lowat;
>> + mtx_leave(&so->so_snd.sb_mtx);
>>
>> if (nmax != so->so_rcv.sb_hiwat) {
>> /* round to MSS boundary */
send socket buffer mutex in tcp_mss_update()