From: Vitaliy Makkoveev Subject: Re: Don't take solock() in soreceive() for tcp(4) sockets To: Alexander Bluhm Cc: tech@openbsd.org Date: Wed, 17 Apr 2024 18:39:14 +0300 On Wed, Apr 17, 2024 at 02:19:42PM +0200, Alexander Bluhm wrote: > ==== run-args-client-tls-tcp.pl ==== > time SUDO= KTRACE= SYSLOGD= perl -I/usr/src/regress/usr.sbin/syslogd -I/usr/src/regress/usr.sbin/syslogd/obj /usr/src/regress/usr.sbin/syslogd/syslogd.pl /usr/src/regress/usr.sbin/syslogd/args-client-tls-tcp.pl > Timeout, server ot1 not responding. > > There are many messages: > > splassert: sbmtxassertlocked: want 0 have 1 > Starting stack trace... > sbdrop(0,1,d0cf1c14) at sbdrop+0x53 > sbdrop(f67cbb84,f67cbbf0,17) at sbdrop+0x53 > sbflush(f67cbb84,f67cbbf0) at sbflush+0x40 > tcp_dodisconnect(d8af86f4) at tcp_dodisconnect+0x47 > tcp_disconnect(f67cbb84) at tcp_disconnect+0x2b > soclose(f67cbb84,0) at soclose+0x6a > soo_close(d7ac2074,f60ed1c0) at soo_close+0x1e > fdrop(d7ac2074,f60ed1c0) at fdrop+0x6a > closef(d7ac2074,f60ed1c0) at closef+0x9d > fdrelease(f60ed1c0,3) at fdrelease+0x7a > sys_close(f60ed1c0,f6b00ce0,f6b00cd8) at sys_close+0x54 > syscall(f6b00d20) at syscall+0x41d > Xsyscall_untramp() at Xsyscall_untramp+0xa9 > end of kernel > End of stack trace. > Thanks for testing this. `sb_mtx' was missing around sbflush() in tcp_dodisconnect(). > Stopped at db_enter+0x4: popl %ebp > TID PID UID PRFLAGS PFLAGS CPU COMMAND > 63476 68916 73 0x19100010 0 3 syslogd > *144380 89910 0 0x18000002 0 2K ttylog > db_enter() at db_enter+0x4 > panic(d0c61c69) at panic+0x7a > kpageflttrap(f5fdaeec,0) at kpageflttrap+0x137 > trap(f5fdaeec) at trap+0x25f > calltrap() at calltrap+0xc > docopyf() at docopyf+0x5 > ptcread(600,f5fdb100,0) at ptcread+0x1a7 > spec_read(f5fdb070) at spec_read+0x80 > VOP_READ(d6ba2c00,f5fdb100,0,d7b879c4) at VOP_READ+0x3c > vn_read(d76fd774,f5fdb100,0) at vn_read+0x95 > dofilereadv(d7a0cb20,5,f5fdb100,0,f5fdb188) at dofilereadv+0xb5 > sys_read(d7a0cb20,f5fdb190,f5fdb188) at sys_read+0x49 > syscall(f5fdb1d0) at syscall+0x41d > Xsyscall_untramp() at Xsyscall_untramp+0xa9 > end of kernel > > [skip] > > ttylog is a program used by syslogd regress to redirect console. > I need this to test syslogd console logging. My suspicion is that > there is a race in kernel console logging code and all these splassert > warnings triggered the crash. > This doesn't crash amd64. i386 has low memory in compare with amd64, so I suspect the same problem [1] as was exposed in fifo subsystem. May be this vnode was reused due to missing vref(). 1. https://marc.info/?l=openbsd-tech&m=171327773331714&w=2 This is the updated diff. Witness kernel passed regress/usr.sbin/syslogd without issues. tcp(4) is very complicated. Isn't it better to convert AF_UNIX, AF_KEY and AF_ROUTE sockets before? Index: sys/kern/uipc_socket.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_socket.c,v retrieving revision 1.330 diff -u -p -r1.330 uipc_socket.c --- sys/kern/uipc_socket.c 15 Apr 2024 21:31:29 -0000 1.330 +++ sys/kern/uipc_socket.c 17 Apr 2024 15:17:53 -0000 @@ -157,12 +157,7 @@ soalloc(const struct protosw *prp, int w switch (dp->dom_family) { case AF_INET: case AF_INET6: - switch (prp->pr_type) { - case SOCK_DGRAM: - case SOCK_RAW: - so->so_rcv.sb_flags |= SB_MTXLOCK | SB_OWNLOCK; - break; - } + so->so_rcv.sb_flags |= SB_MTXLOCK | SB_OWNLOCK; break; case AF_UNIX: so->so_rcv.sb_flags |= SB_MTXLOCK; @@ -348,20 +343,18 @@ sofree(struct socket *so, int keep_lock) #endif /* SOCKET_SPLICE */ sbrelease(so, &so->so_snd); + if (!keep_lock) + sounlock(so); + /* - * Regardless on '_locked' postfix, must release solock() before - * call sorflush_locked() for SB_OWNLOCK marked socket. Can't - * release solock() and call sorflush() because solock() release - * is unwanted for tcp(4) socket. + * The socket was previously unspliced. The unlocked `so_rcv' + * cleanup is safe even if we have concurrent somove() thread. */ - if (so->so_rcv.sb_flags & SB_OWNLOCK) - sounlock(so); - - sorflush_locked(so); - - if (!((so->so_rcv.sb_flags & SB_OWNLOCK) || keep_lock)) - sounlock(so); + if (so->so_proto->pr_flags & PR_RIGHTS && + so->so_proto->pr_domain->dom_dispose) + (*so->so_proto->pr_domain->dom_dispose)(so->so_rcv.sb_mb); + m_purge(so->so_rcv.sb_mb); #ifdef SOCKET_SPLICE if (so->so_sp) { @@ -1222,7 +1215,11 @@ dontblock: SBLASTMBUFCHK(&so->so_rcv, "soreceive 4"); if (pr->pr_flags & PR_WANTRCVD) { sb_mtx_unlock(&so->so_rcv); + if (!dosolock) + solock(so); pru_rcvd(so); + if (!dosolock) + sounlock(so); sb_mtx_lock(&so->so_rcv); } } @@ -1358,17 +1355,9 @@ sosplice(struct socket *so, int fd, off_ membar_consumer(); } - if (so->so_rcv.sb_flags & SB_OWNLOCK) { - if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0) - return (error); - solock(so); - } else { - solock(so); - if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0) { - sounlock(so); - return (error); - } - } + if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0) + return (error); + solock(so); if (so->so_options & SO_ACCEPTCONN) { error = EOPNOTSUPP; @@ -1446,13 +1435,8 @@ sosplice(struct socket *so, int fd, off_ release: sbunlock(sosp, &sosp->so_snd); out: - if (so->so_rcv.sb_flags & SB_OWNLOCK) { - sounlock(so); - sbunlock(so, &so->so_rcv); - } else { - sbunlock(so, &so->so_rcv); - sounlock(so); - } + sounlock(so); + sbunlock(so, &so->so_rcv); if (fp) FRELE(fp, curproc); Index: sys/kern/uipc_socket2.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_socket2.c,v retrieving revision 1.149 diff -u -p -r1.149 uipc_socket2.c --- sys/kern/uipc_socket2.c 11 Apr 2024 13:32:51 -0000 1.149 +++ sys/kern/uipc_socket2.c 17 Apr 2024 15:17:53 -0000 @@ -86,8 +86,10 @@ void soisconnecting(struct socket *so) { soassertlocked(so); + mtx_enter(&so->so_rcv.sb_mtx); so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); so->so_state |= SS_ISCONNECTING; + mtx_leave(&so->so_rcv.sb_mtx); } void @@ -96,8 +98,10 @@ soisconnected(struct socket *so) struct socket *head = so->so_head; soassertlocked(so); + mtx_enter(&so->so_rcv.sb_mtx); so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING); so->so_state |= SS_ISCONNECTED; + mtx_leave(&so->so_rcv.sb_mtx); if (head != NULL && so->so_onq == &head->so_q0) { int persocket = solock_persocket(so); @@ -140,9 +144,9 @@ void soisdisconnecting(struct socket *so) { soassertlocked(so); + mtx_enter(&so->so_rcv.sb_mtx); so->so_state &= ~SS_ISCONNECTING; so->so_state |= SS_ISDISCONNECTING; - mtx_enter(&so->so_rcv.sb_mtx); so->so_rcv.sb_state |= SS_CANTRCVMORE; mtx_leave(&so->so_rcv.sb_mtx); so->so_snd.sb_state |= SS_CANTSENDMORE; @@ -155,9 +159,9 @@ void soisdisconnected(struct socket *so) { soassertlocked(so); + mtx_enter(&so->so_rcv.sb_mtx); so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); so->so_state |= SS_ISDISCONNECTED; - mtx_enter(&so->so_rcv.sb_mtx); so->so_rcv.sb_state |= SS_CANTRCVMORE; mtx_leave(&so->so_rcv.sb_mtx); so->so_snd.sb_state |= SS_CANTSENDMORE; @@ -874,8 +878,7 @@ sbappend(struct socket *so, struct sockb void sbappendstream(struct socket *so, struct sockbuf *sb, struct mbuf *m) { - KASSERT(sb == &so->so_rcv || sb == &so->so_snd); - soassertlocked(so); + sbmtxassertlocked(so, sb); KDASSERT(m->m_nextpkt == NULL); KASSERT(sb->sb_mb == sb->sb_lastrecord); Index: sys/netinet/tcp_input.c =================================================================== RCS file: /cvs/src/sys/netinet/tcp_input.c,v retrieving revision 1.404 diff -u -p -r1.404 tcp_input.c --- sys/netinet/tcp_input.c 13 Apr 2024 23:44:11 -0000 1.404 +++ sys/netinet/tcp_input.c 17 Apr 2024 15:17:53 -0000 @@ -337,10 +337,12 @@ tcp_flush_queue(struct tcpcb *tp) nq = TAILQ_NEXT(q, tcpqe_q); TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q); ND6_HINT(tp); + mtx_enter(&so->so_rcv.sb_mtx); if (so->so_rcv.sb_state & SS_CANTRCVMORE) m_freem(q->tcpqe_m); else sbappendstream(so, &so->so_rcv, q->tcpqe_m); + mtx_leave(&so->so_rcv.sb_mtx); pool_put(&tcpqe_pool, q); q = nq; } while (q != NULL && q->tcpqe_tcp->th_seq == tp->rcv_nxt); @@ -1018,8 +1020,13 @@ findpcb: return IPPROTO_DONE; } } else if (th->th_ack == tp->snd_una && - TAILQ_EMPTY(&tp->t_segq) && - tlen <= sbspace(so, &so->so_rcv)) { + TAILQ_EMPTY(&tp->t_segq)) { + mtx_enter(&so->so_rcv.sb_mtx); + if (tlen > sbspace(so, &so->so_rcv)) { + mtx_leave(&so->so_rcv.sb_mtx); + goto skip; + } + /* * This is a pure, in-sequence data packet * with nothing on the reassembly queue and @@ -1057,6 +1064,7 @@ findpcb: m_adj(m, iphlen + off); sbappendstream(so, &so->so_rcv, m); } + mtx_leave(&so->so_rcv.sb_mtx); tp->t_flags |= TF_BLOCKOUTPUT; sorwakeup(so); tp->t_flags &= ~TF_BLOCKOUTPUT; @@ -1066,6 +1074,7 @@ findpcb: return IPPROTO_DONE; } } +skip: /* * Compute mbuf offset to TCP data segment. @@ -1081,7 +1090,10 @@ findpcb: { int win; + mtx_enter(&so->so_rcv.sb_mtx); win = sbspace(so, &so->so_rcv); + mtx_leave(&so->so_rcv.sb_mtx); + if (win < 0) win = 0; tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); @@ -1873,6 +1885,7 @@ step6: */ if ((tiflags & TH_URG) && th->th_urp && TCPS_HAVERCVDFIN(tp->t_state) == 0) { + mtx_enter(&so->so_rcv.sb_mtx); /* * This is a kludge, but if we receive and accept * random urgent pointers, we'll crash in @@ -1880,6 +1893,7 @@ step6: * actually wanting to send this much urgent data. */ if (th->th_urp + so->so_rcv.sb_cc > sb_max) { + mtx_leave(&so->so_rcv.sb_mtx); th->th_urp = 0; /* XXX */ tiflags &= ~TH_URG; /* XXX */ goto dodata; /* XXX */ @@ -1904,9 +1918,12 @@ step6: (tp->rcv_up - tp->rcv_nxt) - 1; if (so->so_oobmark == 0) so->so_rcv.sb_state |= SS_RCVATMARK; + mtx_leave(&so->so_rcv.sb_mtx); sohasoutofband(so); tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); - } + } else + mtx_leave(&so->so_rcv.sb_mtx); + /* * Remove out of band data so doesn't get presented to user. * This can happen independent of advancing the URG pointer, @@ -1946,12 +1963,14 @@ dodata: /* XXX */ tiflags = th->th_flags & TH_FIN; tcpstat_pkt(tcps_rcvpack, tcps_rcvbyte, tlen); ND6_HINT(tp); + mtx_enter(&so->so_rcv.sb_mtx); if (so->so_rcv.sb_state & SS_CANTRCVMORE) m_freem(m); else { m_adj(m, hdroptlen); sbappendstream(so, &so->so_rcv, m); } + mtx_leave(&so->so_rcv.sb_mtx); tp->t_flags |= TF_BLOCKOUTPUT; sorwakeup(so); tp->t_flags &= ~TF_BLOCKOUTPUT; @@ -3000,6 +3019,7 @@ tcp_mss_update(struct tcpcb *tp) (void)sbreserve(so, &so->so_snd, bufsize); } + mtx_enter(&so->so_rcv.sb_mtx); bufsize = so->so_rcv.sb_hiwat; if (bufsize > mss) { bufsize = roundup(bufsize, mss); @@ -3007,6 +3027,7 @@ tcp_mss_update(struct tcpcb *tp) bufsize = sb_max; (void)sbreserve(so, &so->so_rcv, bufsize); } + mtx_leave(&so->so_rcv.sb_mtx); } @@ -3784,7 +3805,10 @@ syn_cache_add(struct sockaddr *src, stru /* * Initialize some local state. */ + mtx_enter(&so->so_rcv.sb_mtx); win = sbspace(so, &so->so_rcv); + mtx_leave(&so->so_rcv.sb_mtx); + if (win > TCP_MAXWIN) win = TCP_MAXWIN; Index: sys/netinet/tcp_output.c =================================================================== RCS file: /cvs/src/sys/netinet/tcp_output.c,v retrieving revision 1.143 diff -u -p -r1.143 tcp_output.c --- sys/netinet/tcp_output.c 13 Feb 2024 12:22:09 -0000 1.143 +++ sys/netinet/tcp_output.c 17 Apr 2024 15:17:53 -0000 @@ -373,7 +373,9 @@ again: if (off + len < so->so_snd.sb_cc) flags &= ~TH_FIN; + mtx_enter(&so->so_rcv.sb_mtx); win = sbspace(so, &so->so_rcv); + mtx_leave(&so->so_rcv.sb_mtx); /* * Sender silly window avoidance. If connection is idle Index: sys/netinet/tcp_subr.c =================================================================== RCS file: /cvs/src/sys/netinet/tcp_subr.c,v retrieving revision 1.200 diff -u -p -r1.200 tcp_subr.c --- sys/netinet/tcp_subr.c 12 Apr 2024 16:07:09 -0000 1.200 +++ sys/netinet/tcp_subr.c 17 Apr 2024 15:17:53 -0000 @@ -311,7 +311,9 @@ tcp_respond(struct tcpcb *tp, caddr_t te if (tp) { struct socket *so = tp->t_inpcb->inp_socket; + mtx_enter(&so->so_rcv.sb_mtx); win = sbspace(so, &so->so_rcv); + mtx_leave(&so->so_rcv.sb_mtx); /* * If this is called with an unconnected * socket/tp/pcb (tp->pf is 0), we lose. Index: sys/netinet/tcp_usrreq.c =================================================================== RCS file: /cvs/src/sys/netinet/tcp_usrreq.c,v retrieving revision 1.231 diff -u -p -r1.231 tcp_usrreq.c --- sys/netinet/tcp_usrreq.c 12 Apr 2024 16:07:09 -0000 1.231 +++ sys/netinet/tcp_usrreq.c 17 Apr 2024 15:17:53 -0000 @@ -907,13 +907,17 @@ tcp_rcvoob(struct socket *so, struct mbu if ((error = tcp_sogetpcb(so, &inp, &tp))) return (error); + mtx_enter(&so->so_rcv.sb_mtx); if ((so->so_oobmark == 0 && (so->so_rcv.sb_state & SS_RCVATMARK) == 0) || so->so_options & SO_OOBINLINE || tp->t_oobflags & TCPOOB_HADDATA) { + mtx_leave(&so->so_rcv.sb_mtx); error = EINVAL; goto out; } + mtx_leave(&so->so_rcv.sb_mtx); + if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { error = EWOULDBLOCK; goto out; @@ -1039,7 +1043,9 @@ tcp_dodisconnect(struct tcpcb *tp) tp = tcp_drop(tp, 0); else { soisdisconnecting(so); + mtx_enter(&so->so_rcv.sb_mtx); sbflush(so, &so->so_rcv); + mtx_leave(&so->so_rcv.sb_mtx); tp = tcp_usrclosed(tp); if (tp) (void) tcp_output(tp);