Download raw body.
Don't take solock() in soreceive() for tcp(4) sockets
On Wed, Apr 17, 2024 at 02:19:42PM +0200, Alexander Bluhm wrote:
> ==== run-args-client-tls-tcp.pl ====
> time SUDO= KTRACE= SYSLOGD= perl -I/usr/src/regress/usr.sbin/syslogd -I/usr/src/regress/usr.sbin/syslogd/obj /usr/src/regress/usr.sbin/syslogd/syslogd.pl /usr/src/regress/usr.sbin/syslogd/args-client-tls-tcp.pl
> Timeout, server ot1 not responding.
>
> There are many messages:
>
> splassert: sbmtxassertlocked: want 0 have 1
> Starting stack trace...
> sbdrop(0,1,d0cf1c14) at sbdrop+0x53
> sbdrop(f67cbb84,f67cbbf0,17) at sbdrop+0x53
> sbflush(f67cbb84,f67cbbf0) at sbflush+0x40
> tcp_dodisconnect(d8af86f4) at tcp_dodisconnect+0x47
> tcp_disconnect(f67cbb84) at tcp_disconnect+0x2b
> soclose(f67cbb84,0) at soclose+0x6a
> soo_close(d7ac2074,f60ed1c0) at soo_close+0x1e
> fdrop(d7ac2074,f60ed1c0) at fdrop+0x6a
> closef(d7ac2074,f60ed1c0) at closef+0x9d
> fdrelease(f60ed1c0,3) at fdrelease+0x7a
> sys_close(f60ed1c0,f6b00ce0,f6b00cd8) at sys_close+0x54
> syscall(f6b00d20) at syscall+0x41d
> Xsyscall_untramp() at Xsyscall_untramp+0xa9
> end of kernel
> End of stack trace.
>
Thanks for testing this. `sb_mtx' was missing around sbflush() in
tcp_dodisconnect().
> Stopped at db_enter+0x4: popl %ebp
> TID PID UID PRFLAGS PFLAGS CPU COMMAND
> 63476 68916 73 0x19100010 0 3 syslogd
> *144380 89910 0 0x18000002 0 2K ttylog
> db_enter() at db_enter+0x4
> panic(d0c61c69) at panic+0x7a
> kpageflttrap(f5fdaeec,0) at kpageflttrap+0x137
> trap(f5fdaeec) at trap+0x25f
> calltrap() at calltrap+0xc
> docopyf() at docopyf+0x5
> ptcread(600,f5fdb100,0) at ptcread+0x1a7
> spec_read(f5fdb070) at spec_read+0x80
> VOP_READ(d6ba2c00,f5fdb100,0,d7b879c4) at VOP_READ+0x3c
> vn_read(d76fd774,f5fdb100,0) at vn_read+0x95
> dofilereadv(d7a0cb20,5,f5fdb100,0,f5fdb188) at dofilereadv+0xb5
> sys_read(d7a0cb20,f5fdb190,f5fdb188) at sys_read+0x49
> syscall(f5fdb1d0) at syscall+0x41d
> Xsyscall_untramp() at Xsyscall_untramp+0xa9
> end of kernel
>
> [skip]
>
> ttylog is a program used by syslogd regress to redirect console.
> I need this to test syslogd console logging. My suspicion is that
> there is a race in kernel console logging code and all these splassert
> warnings triggered the crash.
>
This doesn't crash amd64. i386 has low memory in compare with amd64, so
I suspect the same problem [1] as was exposed in fifo subsystem. May be
this vnode was reused due to missing vref().
1. https://marc.info/?l=openbsd-tech&m=171327773331714&w=2
This is the updated diff. Witness kernel passed regress/usr.sbin/syslogd
without issues.
tcp(4) is very complicated. Isn't it better to convert AF_UNIX, AF_KEY
and AF_ROUTE sockets before?
Index: sys/kern/uipc_socket.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.330
diff -u -p -r1.330 uipc_socket.c
--- sys/kern/uipc_socket.c 15 Apr 2024 21:31:29 -0000 1.330
+++ sys/kern/uipc_socket.c 17 Apr 2024 15:17:53 -0000
@@ -157,12 +157,7 @@ soalloc(const struct protosw *prp, int w
switch (dp->dom_family) {
case AF_INET:
case AF_INET6:
- switch (prp->pr_type) {
- case SOCK_DGRAM:
- case SOCK_RAW:
- so->so_rcv.sb_flags |= SB_MTXLOCK | SB_OWNLOCK;
- break;
- }
+ so->so_rcv.sb_flags |= SB_MTXLOCK | SB_OWNLOCK;
break;
case AF_UNIX:
so->so_rcv.sb_flags |= SB_MTXLOCK;
@@ -348,20 +343,18 @@ sofree(struct socket *so, int keep_lock)
#endif /* SOCKET_SPLICE */
sbrelease(so, &so->so_snd);
+ if (!keep_lock)
+ sounlock(so);
+
/*
- * Regardless on '_locked' postfix, must release solock() before
- * call sorflush_locked() for SB_OWNLOCK marked socket. Can't
- * release solock() and call sorflush() because solock() release
- * is unwanted for tcp(4) socket.
+ * The socket was previously unspliced. The unlocked `so_rcv'
+ * cleanup is safe even if we have concurrent somove() thread.
*/
- if (so->so_rcv.sb_flags & SB_OWNLOCK)
- sounlock(so);
-
- sorflush_locked(so);
-
- if (!((so->so_rcv.sb_flags & SB_OWNLOCK) || keep_lock))
- sounlock(so);
+ if (so->so_proto->pr_flags & PR_RIGHTS &&
+ so->so_proto->pr_domain->dom_dispose)
+ (*so->so_proto->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
+ m_purge(so->so_rcv.sb_mb);
#ifdef SOCKET_SPLICE
if (so->so_sp) {
@@ -1222,7 +1215,11 @@ dontblock:
SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
if (pr->pr_flags & PR_WANTRCVD) {
sb_mtx_unlock(&so->so_rcv);
+ if (!dosolock)
+ solock(so);
pru_rcvd(so);
+ if (!dosolock)
+ sounlock(so);
sb_mtx_lock(&so->so_rcv);
}
}
@@ -1358,17 +1355,9 @@ sosplice(struct socket *so, int fd, off_
membar_consumer();
}
- if (so->so_rcv.sb_flags & SB_OWNLOCK) {
- if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0)
- return (error);
- solock(so);
- } else {
- solock(so);
- if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0) {
- sounlock(so);
- return (error);
- }
- }
+ if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0)
+ return (error);
+ solock(so);
if (so->so_options & SO_ACCEPTCONN) {
error = EOPNOTSUPP;
@@ -1446,13 +1435,8 @@ sosplice(struct socket *so, int fd, off_
release:
sbunlock(sosp, &sosp->so_snd);
out:
- if (so->so_rcv.sb_flags & SB_OWNLOCK) {
- sounlock(so);
- sbunlock(so, &so->so_rcv);
- } else {
- sbunlock(so, &so->so_rcv);
- sounlock(so);
- }
+ sounlock(so);
+ sbunlock(so, &so->so_rcv);
if (fp)
FRELE(fp, curproc);
Index: sys/kern/uipc_socket2.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket2.c,v
retrieving revision 1.149
diff -u -p -r1.149 uipc_socket2.c
--- sys/kern/uipc_socket2.c 11 Apr 2024 13:32:51 -0000 1.149
+++ sys/kern/uipc_socket2.c 17 Apr 2024 15:17:53 -0000
@@ -86,8 +86,10 @@ void
soisconnecting(struct socket *so)
{
soassertlocked(so);
+ mtx_enter(&so->so_rcv.sb_mtx);
so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
so->so_state |= SS_ISCONNECTING;
+ mtx_leave(&so->so_rcv.sb_mtx);
}
void
@@ -96,8 +98,10 @@ soisconnected(struct socket *so)
struct socket *head = so->so_head;
soassertlocked(so);
+ mtx_enter(&so->so_rcv.sb_mtx);
so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING);
so->so_state |= SS_ISCONNECTED;
+ mtx_leave(&so->so_rcv.sb_mtx);
if (head != NULL && so->so_onq == &head->so_q0) {
int persocket = solock_persocket(so);
@@ -140,9 +144,9 @@ void
soisdisconnecting(struct socket *so)
{
soassertlocked(so);
+ mtx_enter(&so->so_rcv.sb_mtx);
so->so_state &= ~SS_ISCONNECTING;
so->so_state |= SS_ISDISCONNECTING;
- mtx_enter(&so->so_rcv.sb_mtx);
so->so_rcv.sb_state |= SS_CANTRCVMORE;
mtx_leave(&so->so_rcv.sb_mtx);
so->so_snd.sb_state |= SS_CANTSENDMORE;
@@ -155,9 +159,9 @@ void
soisdisconnected(struct socket *so)
{
soassertlocked(so);
+ mtx_enter(&so->so_rcv.sb_mtx);
so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
so->so_state |= SS_ISDISCONNECTED;
- mtx_enter(&so->so_rcv.sb_mtx);
so->so_rcv.sb_state |= SS_CANTRCVMORE;
mtx_leave(&so->so_rcv.sb_mtx);
so->so_snd.sb_state |= SS_CANTSENDMORE;
@@ -874,8 +878,7 @@ sbappend(struct socket *so, struct sockb
void
sbappendstream(struct socket *so, struct sockbuf *sb, struct mbuf *m)
{
- KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
- soassertlocked(so);
+ sbmtxassertlocked(so, sb);
KDASSERT(m->m_nextpkt == NULL);
KASSERT(sb->sb_mb == sb->sb_lastrecord);
Index: sys/netinet/tcp_input.c
===================================================================
RCS file: /cvs/src/sys/netinet/tcp_input.c,v
retrieving revision 1.404
diff -u -p -r1.404 tcp_input.c
--- sys/netinet/tcp_input.c 13 Apr 2024 23:44:11 -0000 1.404
+++ sys/netinet/tcp_input.c 17 Apr 2024 15:17:53 -0000
@@ -337,10 +337,12 @@ tcp_flush_queue(struct tcpcb *tp)
nq = TAILQ_NEXT(q, tcpqe_q);
TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
ND6_HINT(tp);
+ mtx_enter(&so->so_rcv.sb_mtx);
if (so->so_rcv.sb_state & SS_CANTRCVMORE)
m_freem(q->tcpqe_m);
else
sbappendstream(so, &so->so_rcv, q->tcpqe_m);
+ mtx_leave(&so->so_rcv.sb_mtx);
pool_put(&tcpqe_pool, q);
q = nq;
} while (q != NULL && q->tcpqe_tcp->th_seq == tp->rcv_nxt);
@@ -1018,8 +1020,13 @@ findpcb:
return IPPROTO_DONE;
}
} else if (th->th_ack == tp->snd_una &&
- TAILQ_EMPTY(&tp->t_segq) &&
- tlen <= sbspace(so, &so->so_rcv)) {
+ TAILQ_EMPTY(&tp->t_segq)) {
+ mtx_enter(&so->so_rcv.sb_mtx);
+ if (tlen > sbspace(so, &so->so_rcv)) {
+ mtx_leave(&so->so_rcv.sb_mtx);
+ goto skip;
+ }
+
/*
* This is a pure, in-sequence data packet
* with nothing on the reassembly queue and
@@ -1057,6 +1064,7 @@ findpcb:
m_adj(m, iphlen + off);
sbappendstream(so, &so->so_rcv, m);
}
+ mtx_leave(&so->so_rcv.sb_mtx);
tp->t_flags |= TF_BLOCKOUTPUT;
sorwakeup(so);
tp->t_flags &= ~TF_BLOCKOUTPUT;
@@ -1066,6 +1074,7 @@ findpcb:
return IPPROTO_DONE;
}
}
+skip:
/*
* Compute mbuf offset to TCP data segment.
@@ -1081,7 +1090,10 @@ findpcb:
{
int win;
+ mtx_enter(&so->so_rcv.sb_mtx);
win = sbspace(so, &so->so_rcv);
+ mtx_leave(&so->so_rcv.sb_mtx);
+
if (win < 0)
win = 0;
tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
@@ -1873,6 +1885,7 @@ step6:
*/
if ((tiflags & TH_URG) && th->th_urp &&
TCPS_HAVERCVDFIN(tp->t_state) == 0) {
+ mtx_enter(&so->so_rcv.sb_mtx);
/*
* This is a kludge, but if we receive and accept
* random urgent pointers, we'll crash in
@@ -1880,6 +1893,7 @@ step6:
* actually wanting to send this much urgent data.
*/
if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
+ mtx_leave(&so->so_rcv.sb_mtx);
th->th_urp = 0; /* XXX */
tiflags &= ~TH_URG; /* XXX */
goto dodata; /* XXX */
@@ -1904,9 +1918,12 @@ step6:
(tp->rcv_up - tp->rcv_nxt) - 1;
if (so->so_oobmark == 0)
so->so_rcv.sb_state |= SS_RCVATMARK;
+ mtx_leave(&so->so_rcv.sb_mtx);
sohasoutofband(so);
tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
- }
+ } else
+ mtx_leave(&so->so_rcv.sb_mtx);
+
/*
* Remove out of band data so doesn't get presented to user.
* This can happen independent of advancing the URG pointer,
@@ -1946,12 +1963,14 @@ dodata: /* XXX */
tiflags = th->th_flags & TH_FIN;
tcpstat_pkt(tcps_rcvpack, tcps_rcvbyte, tlen);
ND6_HINT(tp);
+ mtx_enter(&so->so_rcv.sb_mtx);
if (so->so_rcv.sb_state & SS_CANTRCVMORE)
m_freem(m);
else {
m_adj(m, hdroptlen);
sbappendstream(so, &so->so_rcv, m);
}
+ mtx_leave(&so->so_rcv.sb_mtx);
tp->t_flags |= TF_BLOCKOUTPUT;
sorwakeup(so);
tp->t_flags &= ~TF_BLOCKOUTPUT;
@@ -3000,6 +3019,7 @@ tcp_mss_update(struct tcpcb *tp)
(void)sbreserve(so, &so->so_snd, bufsize);
}
+ mtx_enter(&so->so_rcv.sb_mtx);
bufsize = so->so_rcv.sb_hiwat;
if (bufsize > mss) {
bufsize = roundup(bufsize, mss);
@@ -3007,6 +3027,7 @@ tcp_mss_update(struct tcpcb *tp)
bufsize = sb_max;
(void)sbreserve(so, &so->so_rcv, bufsize);
}
+ mtx_leave(&so->so_rcv.sb_mtx);
}
@@ -3784,7 +3805,10 @@ syn_cache_add(struct sockaddr *src, stru
/*
* Initialize some local state.
*/
+ mtx_enter(&so->so_rcv.sb_mtx);
win = sbspace(so, &so->so_rcv);
+ mtx_leave(&so->so_rcv.sb_mtx);
+
if (win > TCP_MAXWIN)
win = TCP_MAXWIN;
Index: sys/netinet/tcp_output.c
===================================================================
RCS file: /cvs/src/sys/netinet/tcp_output.c,v
retrieving revision 1.143
diff -u -p -r1.143 tcp_output.c
--- sys/netinet/tcp_output.c 13 Feb 2024 12:22:09 -0000 1.143
+++ sys/netinet/tcp_output.c 17 Apr 2024 15:17:53 -0000
@@ -373,7 +373,9 @@ again:
if (off + len < so->so_snd.sb_cc)
flags &= ~TH_FIN;
+ mtx_enter(&so->so_rcv.sb_mtx);
win = sbspace(so, &so->so_rcv);
+ mtx_leave(&so->so_rcv.sb_mtx);
/*
* Sender silly window avoidance. If connection is idle
Index: sys/netinet/tcp_subr.c
===================================================================
RCS file: /cvs/src/sys/netinet/tcp_subr.c,v
retrieving revision 1.200
diff -u -p -r1.200 tcp_subr.c
--- sys/netinet/tcp_subr.c 12 Apr 2024 16:07:09 -0000 1.200
+++ sys/netinet/tcp_subr.c 17 Apr 2024 15:17:53 -0000
@@ -311,7 +311,9 @@ tcp_respond(struct tcpcb *tp, caddr_t te
if (tp) {
struct socket *so = tp->t_inpcb->inp_socket;
+ mtx_enter(&so->so_rcv.sb_mtx);
win = sbspace(so, &so->so_rcv);
+ mtx_leave(&so->so_rcv.sb_mtx);
/*
* If this is called with an unconnected
* socket/tp/pcb (tp->pf is 0), we lose.
Index: sys/netinet/tcp_usrreq.c
===================================================================
RCS file: /cvs/src/sys/netinet/tcp_usrreq.c,v
retrieving revision 1.231
diff -u -p -r1.231 tcp_usrreq.c
--- sys/netinet/tcp_usrreq.c 12 Apr 2024 16:07:09 -0000 1.231
+++ sys/netinet/tcp_usrreq.c 17 Apr 2024 15:17:53 -0000
@@ -907,13 +907,17 @@ tcp_rcvoob(struct socket *so, struct mbu
if ((error = tcp_sogetpcb(so, &inp, &tp)))
return (error);
+ mtx_enter(&so->so_rcv.sb_mtx);
if ((so->so_oobmark == 0 &&
(so->so_rcv.sb_state & SS_RCVATMARK) == 0) ||
so->so_options & SO_OOBINLINE ||
tp->t_oobflags & TCPOOB_HADDATA) {
+ mtx_leave(&so->so_rcv.sb_mtx);
error = EINVAL;
goto out;
}
+ mtx_leave(&so->so_rcv.sb_mtx);
+
if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
error = EWOULDBLOCK;
goto out;
@@ -1039,7 +1043,9 @@ tcp_dodisconnect(struct tcpcb *tp)
tp = tcp_drop(tp, 0);
else {
soisdisconnecting(so);
+ mtx_enter(&so->so_rcv.sb_mtx);
sbflush(so, &so->so_rcv);
+ mtx_leave(&so->so_rcv.sb_mtx);
tp = tcp_usrclosed(tp);
if (tp)
(void) tcp_output(tp);
Don't take solock() in soreceive() for tcp(4) sockets