From: Alexander Bluhm <bluhm@openbsd.org>
Subject: Re: Use `sb_mtx' to protect `so_rcv' receive buffer of unix(4) sockets
To: Vitaliy Makkoveev <mvs@openbsd.org>
Cc: tech@openbsd.org
Date: Tue, 26 Mar 2024 00:40:00 +0100

On Sat, Mar 23, 2024 at 10:00:35PM +0300, Vitaliy Makkoveev wrote:
> This makes re-locking unnecessary in the uipc_*send() paths, because
> locking one socket is enough to prevent the peer from disconnecting
> concurrently. As a little bonus, a unix(4) socket can now perform
> simultaneous transmission and reception, with one exception:
> uipc_rcvd() still requires the re-lock for connection-oriented
> sockets.
> 
> A few words about filt_soread() and filt_soexcept(). The socket lock
> is not held while they are called from uipc_*send() through
> sorwakeup(). However, the unlocked access to `so_options', `so_state'
> and `so_error' is fine.
> 
> In the uipc_*send() paths the socket lock is still held on the
> sending socket, so the receiving peer can't be disconnected. It also
> can't be or become a listening socket, so SO_ACCEPTCONN is not set
> and, like the SS_ISDISCONNECTED bit, can't be set concurrently. The
> SS_ISCONNECTED bit is always set and can't be cleared concurrently.
> `so_error' is set on the peer sockets only by unp_detach(), which
> also can't be called concurrently on the sending socket.
> 
> This is also true for filt_fiforead() and filt_fifoexcept(). For
> other callers, such as kevent(2) or doaccept(), the socket lock is
> still held.

Passed regress with witness on i386.

OK bluhm@
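
For anyone reading along, here is a minimal sketch (an assumed helper, not
part of the committed diff, which open-codes the check in filt_soread() and
filt_soexcept()) of the locking rule the read-side filters follow after this
change: the receive-buffer mutex is always required, and the socket lock
only when the buffer is not marked SB_MTXLOCK.

	/*
	 * Hypothetical helper illustrating the assertion pattern from
	 * the diff below: the buffer mutex is mandatory, the socket
	 * lock is only asserted for buffers without SB_MTXLOCK.
	 */
	static inline void
	sb_assert_rcv_locked(struct socket *so)
	{
		MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx);
		if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
			soassertlocked_readonly(so);
	}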

> Index: sys/kern/sys_socket.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/sys_socket.c,v
> retrieving revision 1.61
> diff -u -p -r1.61 sys_socket.c
> --- sys/kern/sys_socket.c	15 Apr 2023 13:18:28 -0000	1.61
> +++ sys/kern/sys_socket.c	23 Mar 2024 18:59:11 -0000
> @@ -144,9 +144,11 @@ soo_stat(struct file *fp, struct stat *u
>  	memset(ub, 0, sizeof (*ub));
>  	ub->st_mode = S_IFSOCK;
>  	solock(so);
> +	mtx_enter(&so->so_rcv.sb_mtx);
>  	if ((so->so_rcv.sb_state & SS_CANTRCVMORE) == 0 ||
>  	    so->so_rcv.sb_cc != 0)
>  		ub->st_mode |= S_IRUSR | S_IRGRP | S_IROTH;
> +	mtx_leave(&so->so_rcv.sb_mtx);
>  	if ((so->so_snd.sb_state & SS_CANTSENDMORE) == 0)
>  		ub->st_mode |= S_IWUSR | S_IWGRP | S_IWOTH;
>  	ub->st_uid = so->so_euid;
> Index: sys/kern/uipc_socket.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/uipc_socket.c,v
> retrieving revision 1.321
> diff -u -p -r1.321 uipc_socket.c
> --- sys/kern/uipc_socket.c	22 Mar 2024 17:34:11 -0000	1.321
> +++ sys/kern/uipc_socket.c	23 Mar 2024 18:59:11 -0000
> @@ -160,6 +160,9 @@ soalloc(const struct protosw *prp, int w
>  			break;
>  		}
>  		break;
> +	case AF_UNIX:
> +		so->so_rcv.sb_flags |= SB_MTXLOCK;
> +		break;
>  	}
>  
>  	return (so);
> @@ -987,8 +990,11 @@ dontblock:
>  				 * Dispose of any SCM_RIGHTS message that went
>  				 * through the read path rather than recv.
>  				 */
> -				if (pr->pr_domain->dom_dispose)
> +				if (pr->pr_domain->dom_dispose) {
> +					sb_mtx_unlock(&so->so_rcv);
>  					pr->pr_domain->dom_dispose(cm);
> +					sb_mtx_lock(&so->so_rcv);
> +				}
>  				m_free(cm);
>  			}
>  		}
> @@ -1173,8 +1179,11 @@ dontblock:
>  		}
>  		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
>  		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
> -		if (pr->pr_flags & PR_WANTRCVD)
> +		if (pr->pr_flags & PR_WANTRCVD) {
> +			sb_mtx_unlock(&so->so_rcv);
>  			pru_rcvd(so);
> +			sb_mtx_lock(&so->so_rcv);
> +		}
>  	}
>  	if (orig_resid == uio->uio_resid && orig_resid &&
>  	    (flags & MSG_EOR) == 0 &&
> @@ -1233,10 +1242,10 @@ sorflush(struct socket *so)
>  	/* with SBL_WAIT and SLB_NOINTR sblock() must not fail */
>  	KASSERT(error == 0);
>  	socantrcvmore(so);
> +	mtx_enter(&sb->sb_mtx);
>  	m = sb->sb_mb;
>  	memset(&sb->sb_startzero, 0,
>  	     (caddr_t)&sb->sb_endzero - (caddr_t)&sb->sb_startzero);
> -	mtx_enter(&sb->sb_mtx);
>  	sb->sb_timeo_nsecs = INFSLP;
>  	mtx_leave(&sb->sb_mtx);
>  	sbunlock(so, sb);
> @@ -1757,7 +1766,8 @@ somove(struct socket *so, int wait)
>  void
>  sorwakeup(struct socket *so)
>  {
> -	soassertlocked_readonly(so);
> +	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
> +		soassertlocked_readonly(so);
>  
>  #ifdef SOCKET_SPLICE
>  	if (so->so_rcv.sb_flags & SB_SPLICE) {
> @@ -1877,6 +1887,8 @@ sosetopt(struct socket *so, int level, i
>  				cnt = 1;
>  
>  			solock(so);
> +			mtx_enter(&sb->sb_mtx);
> +
>  			switch (optname) {
>  			case SO_SNDBUF:
>  			case SO_RCVBUF:
> @@ -1898,7 +1910,10 @@ sosetopt(struct socket *so, int level, i
>  				    sb->sb_hiwat : cnt;
>  				break;
>  			}
> +
> +			mtx_leave(&sb->sb_mtx);
>  			sounlock(so);
> +
>  			break;
>  		    }
>  
> @@ -2169,13 +2184,6 @@ sofilt_unlock(struct socket *so, struct 
>  	}
>  }
>  
> -static inline void
> -sofilt_assert_locked(struct socket *so, struct sockbuf *sb)
> -{
> -	MUTEX_ASSERT_LOCKED(&sb->sb_mtx);
> -	soassertlocked_readonly(so);
> -}
> -
>  int
>  soo_kqfilter(struct file *fp, struct knote *kn)
>  {
> @@ -2218,9 +2226,14 @@ filt_soread(struct knote *kn, long hint)
>  	struct socket *so = kn->kn_fp->f_data;
>  	int rv = 0;
>  
> -	sofilt_assert_locked(so, &so->so_rcv);
> +	MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx);
> +	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
> +		soassertlocked_readonly(so);
>  
>  	if (so->so_options & SO_ACCEPTCONN) {
> +		if (so->so_rcv.sb_flags & SB_MTXLOCK)
> +			soassertlocked_readonly(so);
> +
>  		kn->kn_data = so->so_qlen;
>  		rv = (kn->kn_data != 0);
>  
> @@ -2275,7 +2288,8 @@ filt_sowrite(struct knote *kn, long hint
>  	struct socket *so = kn->kn_fp->f_data;
>  	int rv;
>  
> -	sofilt_assert_locked(so, &so->so_snd);
> +	MUTEX_ASSERT_LOCKED(&so->so_snd.sb_mtx);
> +	soassertlocked_readonly(so);
>  
>  	kn->kn_data = sbspace(so, &so->so_snd);
>  	if (so->so_snd.sb_state & SS_CANTSENDMORE) {
> @@ -2306,7 +2320,9 @@ filt_soexcept(struct knote *kn, long hin
>  	struct socket *so = kn->kn_fp->f_data;
>  	int rv = 0;
>  
> -	sofilt_assert_locked(so, &so->so_rcv);
> +	MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx);
> +	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
> +		soassertlocked_readonly(so);
>  
>  #ifdef SOCKET_SPLICE
>  	if (isspliced(so)) {
> Index: sys/kern/uipc_socket2.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/uipc_socket2.c,v
> retrieving revision 1.144
> diff -u -p -r1.144 uipc_socket2.c
> --- sys/kern/uipc_socket2.c	12 Feb 2024 22:48:27 -0000	1.144
> +++ sys/kern/uipc_socket2.c	23 Mar 2024 18:59:11 -0000
> @@ -142,7 +142,9 @@ soisdisconnecting(struct socket *so)
>  	soassertlocked(so);
>  	so->so_state &= ~SS_ISCONNECTING;
>  	so->so_state |= SS_ISDISCONNECTING;
> +	mtx_enter(&so->so_rcv.sb_mtx);
>  	so->so_rcv.sb_state |= SS_CANTRCVMORE;
> +	mtx_leave(&so->so_rcv.sb_mtx);
>  	so->so_snd.sb_state |= SS_CANTSENDMORE;
>  	wakeup(&so->so_timeo);
>  	sowwakeup(so);
> @@ -155,7 +157,9 @@ soisdisconnected(struct socket *so)
>  	soassertlocked(so);
>  	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
>  	so->so_state |= SS_ISDISCONNECTED;
> +	mtx_enter(&so->so_rcv.sb_mtx);
>  	so->so_rcv.sb_state |= SS_CANTRCVMORE;
> +	mtx_leave(&so->so_rcv.sb_mtx);
>  	so->so_snd.sb_state |= SS_CANTSENDMORE;
>  	wakeup(&so->so_timeo);
>  	sowwakeup(so);
> @@ -219,9 +223,10 @@ sonewconn(struct socket *head, int conns
>  	mtx_enter(&head->so_snd.sb_mtx);
>  	so->so_snd.sb_timeo_nsecs = head->so_snd.sb_timeo_nsecs;
>  	mtx_leave(&head->so_snd.sb_mtx);
> +
> +	mtx_enter(&head->so_rcv.sb_mtx);
>  	so->so_rcv.sb_wat = head->so_rcv.sb_wat;
>  	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
> -	mtx_enter(&head->so_rcv.sb_mtx);
>  	so->so_rcv.sb_timeo_nsecs = head->so_rcv.sb_timeo_nsecs;
>  	mtx_leave(&head->so_rcv.sb_mtx);
>  
> @@ -651,16 +656,22 @@ soreserve(struct socket *so, u_long sndc
>  
>  	if (sbreserve(so, &so->so_snd, sndcc))
>  		goto bad;
> -	if (sbreserve(so, &so->so_rcv, rcvcc))
> -		goto bad2;
>  	so->so_snd.sb_wat = sndcc;
> -	so->so_rcv.sb_wat = rcvcc;
> -	if (so->so_rcv.sb_lowat == 0)
> -		so->so_rcv.sb_lowat = 1;
>  	if (so->so_snd.sb_lowat == 0)
>  		so->so_snd.sb_lowat = MCLBYTES;
>  	if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
>  		so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
> +
> +	mtx_enter(&so->so_rcv.sb_mtx);
> +	if (sbreserve(so, &so->so_rcv, rcvcc)) {
> +		mtx_leave(&so->so_rcv.sb_mtx);
> +		goto bad2;
> +	}
> +	so->so_rcv.sb_wat = rcvcc;
> +	if (so->so_rcv.sb_lowat == 0)
> +		so->so_rcv.sb_lowat = 1;
> +	mtx_leave(&so->so_rcv.sb_mtx);
> +
>  	return (0);
>  bad2:
>  	sbrelease(so, &so->so_snd);
> @@ -676,8 +687,7 @@ bad:
>  int
>  sbreserve(struct socket *so, struct sockbuf *sb, u_long cc)
>  {
> -	KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
> -	soassertlocked(so);
> +	sbmtxassertlocked(so, sb);
>  
>  	if (cc == 0 || cc > sb_max)
>  		return (1);
> @@ -818,7 +828,7 @@ sbappend(struct socket *so, struct sockb
>  	if (m == NULL)
>  		return;
>  
> -	soassertlocked(so);
> +	sbmtxassertlocked(so, sb);
>  	SBLASTRECORDCHK(sb, "sbappend 1");
>  
>  	if ((n = sb->sb_lastrecord) != NULL) {
> @@ -899,8 +909,7 @@ sbappendrecord(struct socket *so, struct
>  {
>  	struct mbuf *m;
>  
> -	KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
> -	soassertlocked(so);
> +	sbmtxassertlocked(so, sb);
>  
>  	if (m0 == NULL)
>  		return;
> @@ -984,6 +993,8 @@ sbappendcontrol(struct socket *so, struc
>  	struct mbuf *m, *mlast, *n;
>  	int eor = 0, space = 0;
>  
> +	sbmtxassertlocked(so, sb);
> +
>  	if (control == NULL)
>  		panic("sbappendcontrol");
>  	for (m = control; ; m = m->m_next) {
> @@ -1109,8 +1120,7 @@ sbdrop(struct socket *so, struct sockbuf
>  	struct mbuf *m, *mn;
>  	struct mbuf *next;
>  
> -	KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
> -	soassertlocked(so);
> +	sbmtxassertlocked(so, sb);
>  
>  	next = (m = sb->sb_mb) ? m->m_nextpkt : NULL;
>  	while (len > 0) {
> Index: sys/kern/uipc_usrreq.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v
> retrieving revision 1.202
> diff -u -p -r1.202 uipc_usrreq.c
> --- sys/kern/uipc_usrreq.c	22 Mar 2024 17:34:11 -0000	1.202
> +++ sys/kern/uipc_usrreq.c	23 Mar 2024 18:59:11 -0000
> @@ -489,8 +489,10 @@ uipc_rcvd(struct socket *so)
>  	 * Adjust backpressure on sender
>  	 * and wakeup any waiting to write.
>  	 */
> +	mtx_enter(&so->so_rcv.sb_mtx);
>  	so2->so_snd.sb_mbcnt = so->so_rcv.sb_mbcnt;
>  	so2->so_snd.sb_cc = so->so_rcv.sb_cc;
> +	mtx_leave(&so->so_rcv.sb_mtx);
>  	sowwakeup(so2);
>  	sounlock(so2);
>  }
> @@ -499,8 +501,9 @@ int
>  uipc_send(struct socket *so, struct mbuf *m, struct mbuf *nam,
>      struct mbuf *control)
>  {
> +	struct unpcb *unp = sotounpcb(so);
>  	struct socket *so2;
> -	int error = 0;
> +	int error = 0, dowakeup = 0;
>  
>  	if (control) {
>  		sounlock(so);
> @@ -514,21 +517,24 @@ uipc_send(struct socket *so, struct mbuf
>  		error = EPIPE;
>  		goto dispose;
>  	}
> -	if ((so2 = unp_solock_peer(so)) == NULL) {
> +	if (unp->unp_conn == NULL) {
>  		error = ENOTCONN;
>  		goto dispose;
>  	}
>  
> +	so2 = unp->unp_conn->unp_socket;
> +
>  	/*
>  	 * Send to paired receive port, and then raise
>  	 * send buffer counts to maintain backpressure.
>  	 * Wake up readers.
>  	 */
> +	mtx_enter(&so2->so_rcv.sb_mtx);
>  	if (control) {
>  		if (sbappendcontrol(so2, &so2->so_rcv, m, control)) {
>  			control = NULL;
>  		} else {
> -			sounlock(so2);
> +			mtx_leave(&so2->so_rcv.sb_mtx);
>  			error = ENOBUFS;
>  			goto dispose;
>  		}
> @@ -539,9 +545,12 @@ uipc_send(struct socket *so, struct mbuf
>  	so->so_snd.sb_mbcnt = so2->so_rcv.sb_mbcnt;
>  	so->so_snd.sb_cc = so2->so_rcv.sb_cc;
>  	if (so2->so_rcv.sb_cc > 0)
> +		dowakeup = 1;
> +	mtx_leave(&so2->so_rcv.sb_mtx);
> +
> +	if (dowakeup)
>  		sorwakeup(so2);
>  
> -	sounlock(so2);
>  	m = NULL;
>  
>  dispose:
> @@ -563,7 +572,7 @@ uipc_dgram_send(struct socket *so, struc
>  	struct unpcb *unp = sotounpcb(so);
>  	struct socket *so2;
>  	const struct sockaddr *from;
> -	int error = 0;
> +	int error = 0, dowakeup = 0;
>  
>  	if (control) {
>  		sounlock(so);
> @@ -583,7 +592,7 @@ uipc_dgram_send(struct socket *so, struc
>  			goto dispose;
>  	}
>  
> -	if ((so2 = unp_solock_peer(so)) == NULL) {
> +	if (unp->unp_conn == NULL) {
>  		if (nam != NULL)
>  			error = ECONNREFUSED;
>  		else
> @@ -591,20 +600,24 @@ uipc_dgram_send(struct socket *so, struc
>  		goto dispose;
>  	}
>  
> +	so2 = unp->unp_conn->unp_socket;
> +
>  	if (unp->unp_addr)
>  		from = mtod(unp->unp_addr, struct sockaddr *);
>  	else
>  		from = &sun_noname;
> +
> +	mtx_enter(&so2->so_rcv.sb_mtx);
>  	if (sbappendaddr(so2, &so2->so_rcv, from, m, control)) {
> -		sorwakeup(so2);
> +		dowakeup = 1;
>  		m = NULL;
>  		control = NULL;
>  	} else
>  		error = ENOBUFS;
> +	mtx_leave(&so2->so_rcv.sb_mtx);
>  
> -	if (so2 != so)
> -		sounlock(so2);
> -
> +	if (dowakeup)
> +		sorwakeup(so2);
>  	if (nam)
>  		unp_disconnect(unp);
>  
> @@ -1390,9 +1403,9 @@ unp_gc(void *arg __unused)
>  		if ((unp->unp_gcflags & UNP_GCDEAD) == 0)
>  			continue;
>  		so = unp->unp_socket;
> -		solock(so);
> +		mtx_enter(&so->so_rcv.sb_mtx);
>  		unp_scan(so->so_rcv.sb_mb, unp_remove_gcrefs);
> -		sounlock(so);
> +		mtx_leave(&so->so_rcv.sb_mtx);
>  	}
>  
>  	/*
> @@ -1414,9 +1427,9 @@ unp_gc(void *arg __unused)
>  			unp->unp_gcflags &= ~UNP_GCDEAD;
>  
>  			so = unp->unp_socket;
> -			solock(so);
> +			mtx_enter(&so->so_rcv.sb_mtx);
>  			unp_scan(so->so_rcv.sb_mb, unp_restore_gcrefs);
> -			sounlock(so);
> +			mtx_leave(&so->so_rcv.sb_mtx);
>  
>  			KASSERT(nunref > 0);
>  			nunref--;
> Index: sys/miscfs/fifofs/fifo_vnops.c
> ===================================================================
> RCS file: /cvs/src/sys/miscfs/fifofs/fifo_vnops.c,v
> retrieving revision 1.103
> diff -u -p -r1.103 fifo_vnops.c
> --- sys/miscfs/fifofs/fifo_vnops.c	3 Feb 2024 22:50:09 -0000	1.103
> +++ sys/miscfs/fifofs/fifo_vnops.c	23 Mar 2024 18:59:11 -0000
> @@ -201,7 +201,9 @@ fifo_open(void *v)
>  		if (fip->fi_writers == 1) {
>  			solock(rso);
>  			rso->so_state &= ~SS_ISDISCONNECTED;
> +			mtx_enter(&rso->so_rcv.sb_mtx);
>  			rso->so_rcv.sb_state &= ~SS_CANTRCVMORE;
> +			mtx_leave(&rso->so_rcv.sb_mtx);
>  			sounlock(rso);
>  			if (fip->fi_readers > 0)
>  				wakeup(&fip->fi_readers);
> @@ -518,7 +520,6 @@ filt_fiforead(struct knote *kn, long hin
>  	struct socket *so = kn->kn_hook;
>  	int rv;
>  
> -	soassertlocked(so);
>  	MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx);
>  
>  	kn->kn_data = so->so_rcv.sb_cc;
> @@ -574,7 +575,6 @@ filt_fifoexcept(struct knote *kn, long h
>  	struct socket *so = kn->kn_hook;
>  	int rv = 0;
>  
> -	soassertlocked(so);
>  	MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx);
>  
>  	if (kn->kn_flags & __EV_POLL) {
> Index: sys/sys/socketvar.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/socketvar.h,v
> retrieving revision 1.125
> diff -u -p -r1.125 socketvar.h
> --- sys/sys/socketvar.h	22 Mar 2024 17:34:11 -0000	1.125
> +++ sys/sys/socketvar.h	23 Mar 2024 18:59:11 -0000
> @@ -242,7 +242,10 @@ sb_notify(struct socket *so, struct sock
>  static inline long
>  sbspace(struct socket *so, struct sockbuf *sb)
>  {
> -	soassertlocked_readonly(so);
> +	if (sb->sb_flags & SB_MTXLOCK)
> +		sbmtxassertlocked(so, sb);
> +	else
> +		soassertlocked_readonly(so);
>  
>  	return lmin(sb->sb_hiwat - sb->sb_cc, sb->sb_mbmax - sb->sb_mbcnt);
>  }
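
As a side note, here is a condensed sketch (hypothetical function name,
control-mbuf handling and error paths omitted) of the send-side pattern the
uipc_send() hunk above introduces: the sender keeps its own socket lock,
takes only the peer's receive-buffer mutex to append data and mirror the
counters, and defers the read wakeup until that mutex has been dropped.

	static void
	unp_send_sketch(struct socket *so, struct socket *so2, struct mbuf *m)
	{
		int dowakeup = 0;

		/* our socket lock is held, so the peer can't disconnect */
		mtx_enter(&so2->so_rcv.sb_mtx);
		sbappend(so2, &so2->so_rcv, m);
		/* raise send buffer counts to maintain backpressure */
		so->so_snd.sb_mbcnt = so2->so_rcv.sb_mbcnt;
		so->so_snd.sb_cc = so2->so_rcv.sb_cc;
		if (so2->so_rcv.sb_cc > 0)
			dowakeup = 1;
		mtx_leave(&so2->so_rcv.sb_mtx);

		/* wake readers only after the buffer mutex is released */
		if (dowakeup)
			sorwakeup(so2);
	}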