Index | Thread | Search

From:
Alexander Bluhm <bluhm@openbsd.org>
Subject:
Re: Don't take solock() in soreceive() for tcp(4) sockets
To:
Vitaliy Makkoveev <mvs@openbsd.org>
Cc:
tech@openbsd.org
Date:
Fri, 19 Apr 2024 23:51:13 +0200

Download raw body.

Thread
On Wed, Apr 17, 2024 at 06:39:14PM +0300, Vitaliy Makkoveev wrote:
> This is the updated diff. Witness kernel passed regress/usr.sbin/syslogd
> without issues.

I see a witness error with this diff.  Maybe it was there before
and I missed it due to the regular findings.

witness: lock order reversal:
 1st 0xf9853a0c vmmaplk (&map->lock)
 2nd 0xf93fcd74 sbufrcv (&so->so_rcv.sb_lock)
lock order "&so->so_rcv.sb_lock"(rwlock) -> "&map->lock"(rwlock) first seen at:
#0  witness_checkorder+0x291
#1  rw_enter_read+0x32
#2  vm_map_lock_read_ln+0x15
#3  uvmfault_lookup+0x90
#4  uvm_fault_check+0x14
#5  uvm_fault+0xe4
#6  kpageflttrap+0xe5
#7  trap+0x25f
#8  calltrap+0xc
#9  copyout+0x42
#10 uiomove+0x1c0
#11 soreceive+0x841
#12 soo_read+0x37
#13 dofilereadv+0xb5
#14 sys_read+0x49
#15 syscall+0x41d
#16 Xsyscall_untramp+0xa9
lock order data w1 -> w2 missing

> tcp(4) is very complicated. Isn't it better to convert AF_UNIX, AF_KEY
> and AF_ROUTE sockets before?

I doubt that we will unlock TCP soon.  But if we manage to make
progress in some parts of TCP that is fine.  We have to start
somewhere.

> Index: sys/kern/uipc_socket.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/uipc_socket.c,v
> retrieving revision 1.330
> diff -u -p -r1.330 uipc_socket.c
> --- sys/kern/uipc_socket.c	15 Apr 2024 21:31:29 -0000	1.330
> +++ sys/kern/uipc_socket.c	17 Apr 2024 15:17:53 -0000
> @@ -157,12 +157,7 @@ soalloc(const struct protosw *prp, int w
>  	switch (dp->dom_family) {
>  	case AF_INET:
>  	case AF_INET6:
> -		switch (prp->pr_type) {
> -		case SOCK_DGRAM:
> -		case SOCK_RAW:
> -			so->so_rcv.sb_flags |= SB_MTXLOCK | SB_OWNLOCK;
> -			break;
> -		}
> +		so->so_rcv.sb_flags |= SB_MTXLOCK | SB_OWNLOCK;
>  		break;
>  	case AF_UNIX:
>  		so->so_rcv.sb_flags |= SB_MTXLOCK;
> @@ -348,20 +343,18 @@ sofree(struct socket *so, int keep_lock)
>  #endif /* SOCKET_SPLICE */
>  	sbrelease(so, &so->so_snd);
>  
> +	if (!keep_lock)
> +		sounlock(so);
> +
>  	/*
> -	 * Regardless on '_locked' postfix, must release solock() before
> -	 * call sorflush_locked() for SB_OWNLOCK marked socket. Can't
> -	 * release solock() and call sorflush() because solock() release
> -	 * is unwanted for tcp(4) socket. 
> +	 * The socket was previously unspliced. The unlocked `so_rcv'
> +	 * cleanup is safe even if we have concurrent somove() thread.
>  	 */
>  
> -	if (so->so_rcv.sb_flags & SB_OWNLOCK)
> -		sounlock(so);
> -
> -	sorflush_locked(so);
> -
> -	if (!((so->so_rcv.sb_flags & SB_OWNLOCK) || keep_lock))
> -		sounlock(so);
> +	if (so->so_proto->pr_flags & PR_RIGHTS &&
> +	    so->so_proto->pr_domain->dom_dispose)
> +		(*so->so_proto->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
> +	m_purge(so->so_rcv.sb_mb);
>  
>  #ifdef SOCKET_SPLICE
>  	if (so->so_sp) {
> @@ -1222,7 +1215,11 @@ dontblock:
>  		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
>  		if (pr->pr_flags & PR_WANTRCVD) {
>  			sb_mtx_unlock(&so->so_rcv);
> +			if (!dosolock)
> +				solock(so);
>  			pru_rcvd(so);
> +			if (!dosolock)
> +				sounlock(so);
>  			sb_mtx_lock(&so->so_rcv);
>  		}
>  	}
> @@ -1358,17 +1355,9 @@ sosplice(struct socket *so, int fd, off_
>  		membar_consumer();
>  	}
>  
> -	if (so->so_rcv.sb_flags & SB_OWNLOCK) {
> -		if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0)
> -			return (error);
> -		solock(so);
> -	} else {
> -		solock(so);
> -		if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0) {
> -			sounlock(so);
> -			return (error);
> -		}
> -	}
> +	if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0)
> +		return (error);
> +	solock(so);
>  
>  	if (so->so_options & SO_ACCEPTCONN) {
>  		error = EOPNOTSUPP;
> @@ -1446,13 +1435,8 @@ sosplice(struct socket *so, int fd, off_
>   release:
>  	sbunlock(sosp, &sosp->so_snd);
>   out:
> -	if (so->so_rcv.sb_flags & SB_OWNLOCK) {
> -		sounlock(so);
> -		sbunlock(so, &so->so_rcv);
> -	} else {
> -		sbunlock(so, &so->so_rcv);
> -		sounlock(so);
> -	}
> +	sounlock(so);
> +	sbunlock(so, &so->so_rcv);
>  
>  	if (fp)
>  		FRELE(fp, curproc);
> Index: sys/kern/uipc_socket2.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/uipc_socket2.c,v
> retrieving revision 1.149
> diff -u -p -r1.149 uipc_socket2.c
> --- sys/kern/uipc_socket2.c	11 Apr 2024 13:32:51 -0000	1.149
> +++ sys/kern/uipc_socket2.c	17 Apr 2024 15:17:53 -0000
> @@ -86,8 +86,10 @@ void
>  soisconnecting(struct socket *so)
>  {
>  	soassertlocked(so);
> +	mtx_enter(&so->so_rcv.sb_mtx);
>  	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
>  	so->so_state |= SS_ISCONNECTING;
> +	mtx_leave(&so->so_rcv.sb_mtx);
>  }
>  
>  void
> @@ -96,8 +98,10 @@ soisconnected(struct socket *so)
>  	struct socket *head = so->so_head;
>  
>  	soassertlocked(so);
> +	mtx_enter(&so->so_rcv.sb_mtx);
>  	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING);
>  	so->so_state |= SS_ISCONNECTED;
> +	mtx_leave(&so->so_rcv.sb_mtx);
>  
>  	if (head != NULL && so->so_onq == &head->so_q0) {
>  		int persocket = solock_persocket(so);
> @@ -140,9 +144,9 @@ void
>  soisdisconnecting(struct socket *so)
>  {
>  	soassertlocked(so);
> +	mtx_enter(&so->so_rcv.sb_mtx);
>  	so->so_state &= ~SS_ISCONNECTING;
>  	so->so_state |= SS_ISDISCONNECTING;
> -	mtx_enter(&so->so_rcv.sb_mtx);
>  	so->so_rcv.sb_state |= SS_CANTRCVMORE;
>  	mtx_leave(&so->so_rcv.sb_mtx);
>  	so->so_snd.sb_state |= SS_CANTSENDMORE;
> @@ -155,9 +159,9 @@ void
>  soisdisconnected(struct socket *so)
>  {
>  	soassertlocked(so);
> +	mtx_enter(&so->so_rcv.sb_mtx);
>  	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
>  	so->so_state |= SS_ISDISCONNECTED;
> -	mtx_enter(&so->so_rcv.sb_mtx);
>  	so->so_rcv.sb_state |= SS_CANTRCVMORE;
>  	mtx_leave(&so->so_rcv.sb_mtx);
>  	so->so_snd.sb_state |= SS_CANTSENDMORE;
> @@ -874,8 +878,7 @@ sbappend(struct socket *so, struct sockb
>  void
>  sbappendstream(struct socket *so, struct sockbuf *sb, struct mbuf *m)
>  {
> -	KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
> -	soassertlocked(so);
> +	sbmtxassertlocked(so, sb);
>  	KDASSERT(m->m_nextpkt == NULL);
>  	KASSERT(sb->sb_mb == sb->sb_lastrecord);
>  
> Index: sys/netinet/tcp_input.c
> ===================================================================
> RCS file: /cvs/src/sys/netinet/tcp_input.c,v
> retrieving revision 1.404
> diff -u -p -r1.404 tcp_input.c
> --- sys/netinet/tcp_input.c	13 Apr 2024 23:44:11 -0000	1.404
> +++ sys/netinet/tcp_input.c	17 Apr 2024 15:17:53 -0000
> @@ -337,10 +337,12 @@ tcp_flush_queue(struct tcpcb *tp)
>  		nq = TAILQ_NEXT(q, tcpqe_q);
>  		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
>  		ND6_HINT(tp);
> +		mtx_enter(&so->so_rcv.sb_mtx);
>  		if (so->so_rcv.sb_state & SS_CANTRCVMORE)
>  			m_freem(q->tcpqe_m);
>  		else
>  			sbappendstream(so, &so->so_rcv, q->tcpqe_m);
> +		mtx_leave(&so->so_rcv.sb_mtx);
>  		pool_put(&tcpqe_pool, q);
>  		q = nq;
>  	} while (q != NULL && q->tcpqe_tcp->th_seq == tp->rcv_nxt);
> @@ -1018,8 +1020,13 @@ findpcb:
>  				return IPPROTO_DONE;
>  			}
>  		} else if (th->th_ack == tp->snd_una &&
> -		    TAILQ_EMPTY(&tp->t_segq) &&
> -		    tlen <= sbspace(so, &so->so_rcv)) {
> +		    TAILQ_EMPTY(&tp->t_segq)) {
> +			mtx_enter(&so->so_rcv.sb_mtx);
> +			if (tlen > sbspace(so, &so->so_rcv)) {
> +				mtx_leave(&so->so_rcv.sb_mtx);
> +				goto skip;
> +			}
> +
>  			/*
>  			 * This is a pure, in-sequence data packet
>  			 * with nothing on the reassembly queue and
> @@ -1057,6 +1064,7 @@ findpcb:
>  				m_adj(m, iphlen + off);
>  				sbappendstream(so, &so->so_rcv, m);
>  			}
> +			mtx_leave(&so->so_rcv.sb_mtx);
>  			tp->t_flags |= TF_BLOCKOUTPUT;
>  			sorwakeup(so);
>  			tp->t_flags &= ~TF_BLOCKOUTPUT;
> @@ -1066,6 +1074,7 @@ findpcb:
>  			return IPPROTO_DONE;
>  		}
>  	}
> +skip:
>  
>  	/*
>  	 * Compute mbuf offset to TCP data segment.
> @@ -1081,7 +1090,10 @@ findpcb:
>  	{
>  		int win;
>  
> +		mtx_enter(&so->so_rcv.sb_mtx);
>  		win = sbspace(so, &so->so_rcv);
> +		mtx_leave(&so->so_rcv.sb_mtx);
> +
>  		if (win < 0)
>  			win = 0;
>  		tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
> @@ -1873,6 +1885,7 @@ step6:
>  	 */
>  	if ((tiflags & TH_URG) && th->th_urp &&
>  	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
> +		mtx_enter(&so->so_rcv.sb_mtx);
>  		/*
>  		 * This is a kludge, but if we receive and accept
>  		 * random urgent pointers, we'll crash in
> @@ -1880,6 +1893,7 @@ step6:
>  		 * actually wanting to send this much urgent data.
>  		 */
>  		if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
> +			mtx_leave(&so->so_rcv.sb_mtx);
>  			th->th_urp = 0;			/* XXX */
>  			tiflags &= ~TH_URG;		/* XXX */
>  			goto dodata;			/* XXX */
> @@ -1904,9 +1918,12 @@ step6:
>  			    (tp->rcv_up - tp->rcv_nxt) - 1;
>  			if (so->so_oobmark == 0)
>  				so->so_rcv.sb_state |= SS_RCVATMARK;
> +			mtx_leave(&so->so_rcv.sb_mtx);
>  			sohasoutofband(so);
>  			tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
> -		}
> +		} else
> +			mtx_leave(&so->so_rcv.sb_mtx);
> +
>  		/*
>  		 * Remove out of band data so doesn't get presented to user.
>  		 * This can happen independent of advancing the URG pointer,
> @@ -1946,12 +1963,14 @@ dodata:							/* XXX */
>  			tiflags = th->th_flags & TH_FIN;
>  			tcpstat_pkt(tcps_rcvpack, tcps_rcvbyte, tlen);
>  			ND6_HINT(tp);
> +			mtx_enter(&so->so_rcv.sb_mtx);
>  			if (so->so_rcv.sb_state & SS_CANTRCVMORE)
>  				m_freem(m);
>  			else {
>  				m_adj(m, hdroptlen);
>  				sbappendstream(so, &so->so_rcv, m);
>  			}
> +			mtx_leave(&so->so_rcv.sb_mtx);
>  			tp->t_flags |= TF_BLOCKOUTPUT;
>  			sorwakeup(so);
>  			tp->t_flags &= ~TF_BLOCKOUTPUT;
> @@ -3000,6 +3019,7 @@ tcp_mss_update(struct tcpcb *tp)
>  		(void)sbreserve(so, &so->so_snd, bufsize);
>  	}
>  
> +	mtx_enter(&so->so_rcv.sb_mtx);
>  	bufsize = so->so_rcv.sb_hiwat;
>  	if (bufsize > mss) {
>  		bufsize = roundup(bufsize, mss);
> @@ -3007,6 +3027,7 @@ tcp_mss_update(struct tcpcb *tp)
>  			bufsize = sb_max;
>  		(void)sbreserve(so, &so->so_rcv, bufsize);
>  	}
> +	mtx_leave(&so->so_rcv.sb_mtx);
>  
>  }
>  
> @@ -3784,7 +3805,10 @@ syn_cache_add(struct sockaddr *src, stru
>  	/*
>  	 * Initialize some local state.
>  	 */
> +	mtx_enter(&so->so_rcv.sb_mtx);
>  	win = sbspace(so, &so->so_rcv);
> +	mtx_leave(&so->so_rcv.sb_mtx);
> +
>  	if (win > TCP_MAXWIN)
>  		win = TCP_MAXWIN;
>  
> Index: sys/netinet/tcp_output.c
> ===================================================================
> RCS file: /cvs/src/sys/netinet/tcp_output.c,v
> retrieving revision 1.143
> diff -u -p -r1.143 tcp_output.c
> --- sys/netinet/tcp_output.c	13 Feb 2024 12:22:09 -0000	1.143
> +++ sys/netinet/tcp_output.c	17 Apr 2024 15:17:53 -0000
> @@ -373,7 +373,9 @@ again:
>  	if (off + len < so->so_snd.sb_cc)
>  		flags &= ~TH_FIN;
>  
> +	mtx_enter(&so->so_rcv.sb_mtx);
>  	win = sbspace(so, &so->so_rcv);
> +	mtx_leave(&so->so_rcv.sb_mtx);
>  
>  	/*
>  	 * Sender silly window avoidance.  If connection is idle
> Index: sys/netinet/tcp_subr.c
> ===================================================================
> RCS file: /cvs/src/sys/netinet/tcp_subr.c,v
> retrieving revision 1.200
> diff -u -p -r1.200 tcp_subr.c
> --- sys/netinet/tcp_subr.c	12 Apr 2024 16:07:09 -0000	1.200
> +++ sys/netinet/tcp_subr.c	17 Apr 2024 15:17:53 -0000
> @@ -311,7 +311,9 @@ tcp_respond(struct tcpcb *tp, caddr_t te
>  
>  	if (tp) {
>  		struct socket *so = tp->t_inpcb->inp_socket;
> +		mtx_enter(&so->so_rcv.sb_mtx);
>  		win = sbspace(so, &so->so_rcv);
> +		mtx_leave(&so->so_rcv.sb_mtx);
>  		/*
>  		 * If this is called with an unconnected
>  		 * socket/tp/pcb (tp->pf is 0), we lose.
> Index: sys/netinet/tcp_usrreq.c
> ===================================================================
> RCS file: /cvs/src/sys/netinet/tcp_usrreq.c,v
> retrieving revision 1.231
> diff -u -p -r1.231 tcp_usrreq.c
> --- sys/netinet/tcp_usrreq.c	12 Apr 2024 16:07:09 -0000	1.231
> +++ sys/netinet/tcp_usrreq.c	17 Apr 2024 15:17:53 -0000
> @@ -907,13 +907,17 @@ tcp_rcvoob(struct socket *so, struct mbu
>  	if ((error = tcp_sogetpcb(so, &inp, &tp)))
>  		return (error);
>  
> +	mtx_enter(&so->so_rcv.sb_mtx);
>  	if ((so->so_oobmark == 0 &&
>  	    (so->so_rcv.sb_state & SS_RCVATMARK) == 0) ||
>  	    so->so_options & SO_OOBINLINE ||
>  	    tp->t_oobflags & TCPOOB_HADDATA) {
> +		mtx_leave(&so->so_rcv.sb_mtx);
>  		error = EINVAL;
>  		goto out;
>  	}
> +	mtx_leave(&so->so_rcv.sb_mtx);
> +
>  	if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
>  		error = EWOULDBLOCK;
>  		goto out;
> @@ -1039,7 +1043,9 @@ tcp_dodisconnect(struct tcpcb *tp)
>  		tp = tcp_drop(tp, 0);
>  	else {
>  		soisdisconnecting(so);
> +		mtx_enter(&so->so_rcv.sb_mtx);
>  		sbflush(so, &so->so_rcv);
> +		mtx_leave(&so->so_rcv.sb_mtx);
>  		tp = tcp_usrclosed(tp);
>  		if (tp)
>  			(void) tcp_output(tp);