Download raw body.
Don't take solock() in soreceive() for tcp(4) sockets
On Wed, Apr 17, 2024 at 06:39:14PM +0300, Vitaliy Makkoveev wrote:
> This is the updated diff. Witness kernel passed regress/usr.sbin/syslogd
> without issues.
I see a witness error with this diff. Maybe it was there before
and I missed it due to the regular findings.
witness: lock order reversal:
1st 0xf9853a0c vmmaplk (&map->lock)
2nd 0xf93fcd74 sbufrcv (&so->so_rcv.sb_lock)
lock order "&so->so_rcv.sb_lock"(rwlock) -> "&map->lock"(rwlock) first seen at:
#0 witness_checkorder+0x291
#1 rw_enter_read+0x32
#2 vm_map_lock_read_ln+0x15
#3 uvmfault_lookup+0x90
#4 uvm_fault_check+0x14
#5 uvm_fault+0xe4
#6 kpageflttrap+0xe5
#7 trap+0x25f
#8 calltrap+0xc
#9 copyout+0x42
#10 uiomove+0x1c0
#11 soreceive+0x841
#12 soo_read+0x37
#13 dofilereadv+0xb5
#14 sys_read+0x49
#15 syscall+0x41d
#16 Xsyscall_untramp+0xa9
lock order data w1 -> w2 missing
> tcp(4) is very complicated. Isn't it better to convert AF_UNIX, AF_KEY
> and AF_ROUTE sockets before?
I doubt that we will unlock TCP soon. But if we manage to make
progress in some parts of TCP that is fine. We have to start
somewhere.
> Index: sys/kern/uipc_socket.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/uipc_socket.c,v
> retrieving revision 1.330
> diff -u -p -r1.330 uipc_socket.c
> --- sys/kern/uipc_socket.c 15 Apr 2024 21:31:29 -0000 1.330
> +++ sys/kern/uipc_socket.c 17 Apr 2024 15:17:53 -0000
> @@ -157,12 +157,7 @@ soalloc(const struct protosw *prp, int w
> switch (dp->dom_family) {
> case AF_INET:
> case AF_INET6:
> - switch (prp->pr_type) {
> - case SOCK_DGRAM:
> - case SOCK_RAW:
> - so->so_rcv.sb_flags |= SB_MTXLOCK | SB_OWNLOCK;
> - break;
> - }
> + so->so_rcv.sb_flags |= SB_MTXLOCK | SB_OWNLOCK;
> break;
> case AF_UNIX:
> so->so_rcv.sb_flags |= SB_MTXLOCK;
> @@ -348,20 +343,18 @@ sofree(struct socket *so, int keep_lock)
> #endif /* SOCKET_SPLICE */
> sbrelease(so, &so->so_snd);
>
> + if (!keep_lock)
> + sounlock(so);
> +
> /*
> - * Regardless on '_locked' postfix, must release solock() before
> - * call sorflush_locked() for SB_OWNLOCK marked socket. Can't
> - * release solock() and call sorflush() because solock() release
> - * is unwanted for tcp(4) socket.
> + * The socket was previously unspliced. The unlocked `so_rcv'
> + * cleanup is safe even if we have concurrent somove() thread.
> */
>
> - if (so->so_rcv.sb_flags & SB_OWNLOCK)
> - sounlock(so);
> -
> - sorflush_locked(so);
> -
> - if (!((so->so_rcv.sb_flags & SB_OWNLOCK) || keep_lock))
> - sounlock(so);
> + if (so->so_proto->pr_flags & PR_RIGHTS &&
> + so->so_proto->pr_domain->dom_dispose)
> + (*so->so_proto->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
> + m_purge(so->so_rcv.sb_mb);
>
> #ifdef SOCKET_SPLICE
> if (so->so_sp) {
> @@ -1222,7 +1215,11 @@ dontblock:
> SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
> if (pr->pr_flags & PR_WANTRCVD) {
> sb_mtx_unlock(&so->so_rcv);
> + if (!dosolock)
> + solock(so);
> pru_rcvd(so);
> + if (!dosolock)
> + sounlock(so);
> sb_mtx_lock(&so->so_rcv);
> }
> }
> @@ -1358,17 +1355,9 @@ sosplice(struct socket *so, int fd, off_
> membar_consumer();
> }
>
> - if (so->so_rcv.sb_flags & SB_OWNLOCK) {
> - if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0)
> - return (error);
> - solock(so);
> - } else {
> - solock(so);
> - if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0) {
> - sounlock(so);
> - return (error);
> - }
> - }
> + if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0)
> + return (error);
> + solock(so);
>
> if (so->so_options & SO_ACCEPTCONN) {
> error = EOPNOTSUPP;
> @@ -1446,13 +1435,8 @@ sosplice(struct socket *so, int fd, off_
> release:
> sbunlock(sosp, &sosp->so_snd);
> out:
> - if (so->so_rcv.sb_flags & SB_OWNLOCK) {
> - sounlock(so);
> - sbunlock(so, &so->so_rcv);
> - } else {
> - sbunlock(so, &so->so_rcv);
> - sounlock(so);
> - }
> + sounlock(so);
> + sbunlock(so, &so->so_rcv);
>
> if (fp)
> FRELE(fp, curproc);
> Index: sys/kern/uipc_socket2.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/uipc_socket2.c,v
> retrieving revision 1.149
> diff -u -p -r1.149 uipc_socket2.c
> --- sys/kern/uipc_socket2.c 11 Apr 2024 13:32:51 -0000 1.149
> +++ sys/kern/uipc_socket2.c 17 Apr 2024 15:17:53 -0000
> @@ -86,8 +86,10 @@ void
> soisconnecting(struct socket *so)
> {
> soassertlocked(so);
> + mtx_enter(&so->so_rcv.sb_mtx);
> so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
> so->so_state |= SS_ISCONNECTING;
> + mtx_leave(&so->so_rcv.sb_mtx);
> }
>
> void
> @@ -96,8 +98,10 @@ soisconnected(struct socket *so)
> struct socket *head = so->so_head;
>
> soassertlocked(so);
> + mtx_enter(&so->so_rcv.sb_mtx);
> so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING);
> so->so_state |= SS_ISCONNECTED;
> + mtx_leave(&so->so_rcv.sb_mtx);
>
> if (head != NULL && so->so_onq == &head->so_q0) {
> int persocket = solock_persocket(so);
> @@ -140,9 +144,9 @@ void
> soisdisconnecting(struct socket *so)
> {
> soassertlocked(so);
> + mtx_enter(&so->so_rcv.sb_mtx);
> so->so_state &= ~SS_ISCONNECTING;
> so->so_state |= SS_ISDISCONNECTING;
> - mtx_enter(&so->so_rcv.sb_mtx);
> so->so_rcv.sb_state |= SS_CANTRCVMORE;
> mtx_leave(&so->so_rcv.sb_mtx);
> so->so_snd.sb_state |= SS_CANTSENDMORE;
> @@ -155,9 +159,9 @@ void
> soisdisconnected(struct socket *so)
> {
> soassertlocked(so);
> + mtx_enter(&so->so_rcv.sb_mtx);
> so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
> so->so_state |= SS_ISDISCONNECTED;
> - mtx_enter(&so->so_rcv.sb_mtx);
> so->so_rcv.sb_state |= SS_CANTRCVMORE;
> mtx_leave(&so->so_rcv.sb_mtx);
> so->so_snd.sb_state |= SS_CANTSENDMORE;
> @@ -874,8 +878,7 @@ sbappend(struct socket *so, struct sockb
> void
> sbappendstream(struct socket *so, struct sockbuf *sb, struct mbuf *m)
> {
> - KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
> - soassertlocked(so);
> + sbmtxassertlocked(so, sb);
> KDASSERT(m->m_nextpkt == NULL);
> KASSERT(sb->sb_mb == sb->sb_lastrecord);
>
> Index: sys/netinet/tcp_input.c
> ===================================================================
> RCS file: /cvs/src/sys/netinet/tcp_input.c,v
> retrieving revision 1.404
> diff -u -p -r1.404 tcp_input.c
> --- sys/netinet/tcp_input.c 13 Apr 2024 23:44:11 -0000 1.404
> +++ sys/netinet/tcp_input.c 17 Apr 2024 15:17:53 -0000
> @@ -337,10 +337,12 @@ tcp_flush_queue(struct tcpcb *tp)
> nq = TAILQ_NEXT(q, tcpqe_q);
> TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
> ND6_HINT(tp);
> + mtx_enter(&so->so_rcv.sb_mtx);
> if (so->so_rcv.sb_state & SS_CANTRCVMORE)
> m_freem(q->tcpqe_m);
> else
> sbappendstream(so, &so->so_rcv, q->tcpqe_m);
> + mtx_leave(&so->so_rcv.sb_mtx);
> pool_put(&tcpqe_pool, q);
> q = nq;
> } while (q != NULL && q->tcpqe_tcp->th_seq == tp->rcv_nxt);
> @@ -1018,8 +1020,13 @@ findpcb:
> return IPPROTO_DONE;
> }
> } else if (th->th_ack == tp->snd_una &&
> - TAILQ_EMPTY(&tp->t_segq) &&
> - tlen <= sbspace(so, &so->so_rcv)) {
> + TAILQ_EMPTY(&tp->t_segq)) {
> + mtx_enter(&so->so_rcv.sb_mtx);
> + if (tlen > sbspace(so, &so->so_rcv)) {
> + mtx_leave(&so->so_rcv.sb_mtx);
> + goto skip;
> + }
> +
> /*
> * This is a pure, in-sequence data packet
> * with nothing on the reassembly queue and
> @@ -1057,6 +1064,7 @@ findpcb:
> m_adj(m, iphlen + off);
> sbappendstream(so, &so->so_rcv, m);
> }
> + mtx_leave(&so->so_rcv.sb_mtx);
> tp->t_flags |= TF_BLOCKOUTPUT;
> sorwakeup(so);
> tp->t_flags &= ~TF_BLOCKOUTPUT;
> @@ -1066,6 +1074,7 @@ findpcb:
> return IPPROTO_DONE;
> }
> }
> +skip:
>
> /*
> * Compute mbuf offset to TCP data segment.
> @@ -1081,7 +1090,10 @@ findpcb:
> {
> int win;
>
> + mtx_enter(&so->so_rcv.sb_mtx);
> win = sbspace(so, &so->so_rcv);
> + mtx_leave(&so->so_rcv.sb_mtx);
> +
> if (win < 0)
> win = 0;
> tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
> @@ -1873,6 +1885,7 @@ step6:
> */
> if ((tiflags & TH_URG) && th->th_urp &&
> TCPS_HAVERCVDFIN(tp->t_state) == 0) {
> + mtx_enter(&so->so_rcv.sb_mtx);
> /*
> * This is a kludge, but if we receive and accept
> * random urgent pointers, we'll crash in
> @@ -1880,6 +1893,7 @@ step6:
> * actually wanting to send this much urgent data.
> */
> if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
> + mtx_leave(&so->so_rcv.sb_mtx);
> th->th_urp = 0; /* XXX */
> tiflags &= ~TH_URG; /* XXX */
> goto dodata; /* XXX */
> @@ -1904,9 +1918,12 @@ step6:
> (tp->rcv_up - tp->rcv_nxt) - 1;
> if (so->so_oobmark == 0)
> so->so_rcv.sb_state |= SS_RCVATMARK;
> + mtx_leave(&so->so_rcv.sb_mtx);
> sohasoutofband(so);
> tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
> - }
> + } else
> + mtx_leave(&so->so_rcv.sb_mtx);
> +
> /*
> * Remove out of band data so doesn't get presented to user.
> * This can happen independent of advancing the URG pointer,
> @@ -1946,12 +1963,14 @@ dodata: /* XXX */
> tiflags = th->th_flags & TH_FIN;
> tcpstat_pkt(tcps_rcvpack, tcps_rcvbyte, tlen);
> ND6_HINT(tp);
> + mtx_enter(&so->so_rcv.sb_mtx);
> if (so->so_rcv.sb_state & SS_CANTRCVMORE)
> m_freem(m);
> else {
> m_adj(m, hdroptlen);
> sbappendstream(so, &so->so_rcv, m);
> }
> + mtx_leave(&so->so_rcv.sb_mtx);
> tp->t_flags |= TF_BLOCKOUTPUT;
> sorwakeup(so);
> tp->t_flags &= ~TF_BLOCKOUTPUT;
> @@ -3000,6 +3019,7 @@ tcp_mss_update(struct tcpcb *tp)
> (void)sbreserve(so, &so->so_snd, bufsize);
> }
>
> + mtx_enter(&so->so_rcv.sb_mtx);
> bufsize = so->so_rcv.sb_hiwat;
> if (bufsize > mss) {
> bufsize = roundup(bufsize, mss);
> @@ -3007,6 +3027,7 @@ tcp_mss_update(struct tcpcb *tp)
> bufsize = sb_max;
> (void)sbreserve(so, &so->so_rcv, bufsize);
> }
> + mtx_leave(&so->so_rcv.sb_mtx);
>
> }
>
> @@ -3784,7 +3805,10 @@ syn_cache_add(struct sockaddr *src, stru
> /*
> * Initialize some local state.
> */
> + mtx_enter(&so->so_rcv.sb_mtx);
> win = sbspace(so, &so->so_rcv);
> + mtx_leave(&so->so_rcv.sb_mtx);
> +
> if (win > TCP_MAXWIN)
> win = TCP_MAXWIN;
>
> Index: sys/netinet/tcp_output.c
> ===================================================================
> RCS file: /cvs/src/sys/netinet/tcp_output.c,v
> retrieving revision 1.143
> diff -u -p -r1.143 tcp_output.c
> --- sys/netinet/tcp_output.c 13 Feb 2024 12:22:09 -0000 1.143
> +++ sys/netinet/tcp_output.c 17 Apr 2024 15:17:53 -0000
> @@ -373,7 +373,9 @@ again:
> if (off + len < so->so_snd.sb_cc)
> flags &= ~TH_FIN;
>
> + mtx_enter(&so->so_rcv.sb_mtx);
> win = sbspace(so, &so->so_rcv);
> + mtx_leave(&so->so_rcv.sb_mtx);
>
> /*
> * Sender silly window avoidance. If connection is idle
> Index: sys/netinet/tcp_subr.c
> ===================================================================
> RCS file: /cvs/src/sys/netinet/tcp_subr.c,v
> retrieving revision 1.200
> diff -u -p -r1.200 tcp_subr.c
> --- sys/netinet/tcp_subr.c 12 Apr 2024 16:07:09 -0000 1.200
> +++ sys/netinet/tcp_subr.c 17 Apr 2024 15:17:53 -0000
> @@ -311,7 +311,9 @@ tcp_respond(struct tcpcb *tp, caddr_t te
>
> if (tp) {
> struct socket *so = tp->t_inpcb->inp_socket;
> + mtx_enter(&so->so_rcv.sb_mtx);
> win = sbspace(so, &so->so_rcv);
> + mtx_leave(&so->so_rcv.sb_mtx);
> /*
> * If this is called with an unconnected
> * socket/tp/pcb (tp->pf is 0), we lose.
> Index: sys/netinet/tcp_usrreq.c
> ===================================================================
> RCS file: /cvs/src/sys/netinet/tcp_usrreq.c,v
> retrieving revision 1.231
> diff -u -p -r1.231 tcp_usrreq.c
> --- sys/netinet/tcp_usrreq.c 12 Apr 2024 16:07:09 -0000 1.231
> +++ sys/netinet/tcp_usrreq.c 17 Apr 2024 15:17:53 -0000
> @@ -907,13 +907,17 @@ tcp_rcvoob(struct socket *so, struct mbu
> if ((error = tcp_sogetpcb(so, &inp, &tp)))
> return (error);
>
> + mtx_enter(&so->so_rcv.sb_mtx);
> if ((so->so_oobmark == 0 &&
> (so->so_rcv.sb_state & SS_RCVATMARK) == 0) ||
> so->so_options & SO_OOBINLINE ||
> tp->t_oobflags & TCPOOB_HADDATA) {
> + mtx_leave(&so->so_rcv.sb_mtx);
> error = EINVAL;
> goto out;
> }
> + mtx_leave(&so->so_rcv.sb_mtx);
> +
> if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
> error = EWOULDBLOCK;
> goto out;
> @@ -1039,7 +1043,9 @@ tcp_dodisconnect(struct tcpcb *tp)
> tp = tcp_drop(tp, 0);
> else {
> soisdisconnecting(so);
> + mtx_enter(&so->so_rcv.sb_mtx);
> sbflush(so, &so->so_rcv);
> + mtx_leave(&so->so_rcv.sb_mtx);
> tp = tcp_usrclosed(tp);
> if (tp)
> (void) tcp_output(tp);
Don't take solock() in soreceive() for tcp(4) sockets