Index | Thread | Search

From:
Alexander Bluhm <bluhm@openbsd.org>
Subject:
Re: Turn sblock() to `sb_lock' rwlock(9) wrapper for all sockets
To:
Vitaliy Makkoveev <mvs@openbsd.org>
Cc:
tech@openbsd.org
Date:
Fri, 17 May 2024 19:20:08 +0200

Download raw body.

Thread
On Tue, May 07, 2024 at 11:23:17PM +0300, Vitaliy Makkoveev wrote:
> On Sat, May 04, 2024 at 02:57:00AM +0300, Vitaliy Makkoveev wrote:
> > sblock() is fine-grained lock which serializes socket send and receive
> > routines on `so_rcv' or `so_snd' buffers. There is no big problem to
> > wait netlock while holding sblock().
> > 
> > So unify sblock() behaviour to all sockets. Now it should be always
> > taken before solock() in all involved paths as sosend(), soreceive(),
> > sorflush() and sosplice() as already done for "unlocked" socket buffers.
> > 
> > This unification removes a lot of temporary "sb_flags & SB_MTXLOCK" code
> > from sockets layer. This unification makes straight "solock()" and
> > "sblock()" lock order, no more solock() -> sblock() -> sounlock() ->
> > solock() -> sbunlock() -> sounlock() chains in sosend() and soreceive()
> > paths. This unification brings witness(4) support for sblock(), include
> > NFS involved sockets, which is useful.
> > 
> > Since the witness(4) support was introduced to sblock() with this diff,
> > some new witness reports appeared. That's normal, it's the first time
> > they were exposed. In addition to [1] "lock order reversal in soreceive
> > and NFS" thread, I could say that `so_snd' also produces witness
> > reports during regress/sys/ffs/nfs run.
> > 
> 
> Updated against last tree changes. In addition, I removed unused `so'
> argument from sblock()/sbunlock().

OK bluhm@

> Index: sys/kern/uipc_socket.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/uipc_socket.c,v
> diff -u -p -r1.333 uipc_socket.c
> --- sys/kern/uipc_socket.c	3 May 2024 17:43:09 -0000	1.333
> +++ sys/kern/uipc_socket.c	7 May 2024 19:27:18 -0000
> @@ -66,7 +66,6 @@ void	soreaper(void *);
>  void	soput(void *);
>  int	somove(struct socket *, int);
>  void	sorflush(struct socket *);
> -void	sorflush_locked(struct socket *);
>  
>  void	filt_sordetach(struct knote *kn);
>  int	filt_soread(struct knote *kn, long hint);
> @@ -606,11 +605,11 @@ sosend(struct socket *so, struct mbuf *a
>  
>  #define	snderr(errno)	{ error = errno; goto release; }
>  
> -	if (dosolock)
> -		solock_shared(so);
>  restart:
> -	if ((error = sblock(so, &so->so_snd, SBLOCKWAIT(flags))) != 0)
> +	if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0)
>  		goto out;
> +	if (dosolock)
> +		solock_shared(so);
>  	sb_mtx_lock(&so->so_snd);
>  	so->so_snd.sb_state |= SS_ISSENDING;
>  	do {
> @@ -643,15 +642,12 @@ restart:
>  		    (atomic || space < so->so_snd.sb_lowat))) {
>  			if (flags & MSG_DONTWAIT)
>  				snderr(EWOULDBLOCK);
> -			sbunlock(so, &so->so_snd);
> -
> -			if (so->so_snd.sb_flags & SB_MTXLOCK)
> -				error = sbwait_locked(so, &so->so_snd);
> -			else
> -				error = sbwait(so, &so->so_snd);
> -
> +			sbunlock(&so->so_snd);
> +			error = sbwait(so, &so->so_snd);
>  			so->so_snd.sb_state &= ~SS_ISSENDING;
>  			sb_mtx_unlock(&so->so_snd);
> +			if (dosolock)
> +				sounlock_shared(so);
>  			if (error)
>  				goto out;
>  			goto restart;
> @@ -705,10 +701,10 @@ restart:
>  release:
>  	so->so_snd.sb_state &= ~SS_ISSENDING;
>  	sb_mtx_unlock(&so->so_snd);
> -	sbunlock(so, &so->so_snd);
> -out:
>  	if (dosolock)
>  		sounlock_shared(so);
> +	sbunlock(&so->so_snd);
> +out:
>  	m_freem(top);
>  	m_freem(control);
>  	return (error);
> @@ -875,11 +871,11 @@ bad:
>  	if (mp)
>  		*mp = NULL;
>  
> +restart:
> +	if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0)
> +		return (error);
>  	if (dosolock)
>  		solock_shared(so);
> -restart:
> -	if ((error = sblock(so, &so->so_rcv, SBLOCKWAIT(flags))) != 0)
> -		goto out;
>  	sb_mtx_lock(&so->so_rcv);
>  
>  	m = so->so_rcv.sb_mb;
> @@ -944,25 +940,13 @@ restart:
>  		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
>  		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
>  
> -		if (so->so_rcv.sb_flags & SB_MTXLOCK) {
> -			sbunlock_locked(so, &so->so_rcv);
> -			if (dosolock)
> -				sounlock_shared(so);
> -			error = sbwait_locked(so, &so->so_rcv);
> -			sb_mtx_unlock(&so->so_rcv);
> -			if (error)
> -				return (error);
> -			if (dosolock)
> -				solock_shared(so);
> -		} else {
> -			sb_mtx_unlock(&so->so_rcv);
> -			sbunlock(so, &so->so_rcv);
> -			error = sbwait(so, &so->so_rcv);
> -			if (error) {
> -				sounlock_shared(so);
> -				return (error);
> -			}
> -		}
> +		sbunlock(&so->so_rcv);
> +		error = sbwait(so, &so->so_rcv);
> +		sb_mtx_unlock(&so->so_rcv);
> +		if (dosolock)
> +			sounlock_shared(so);
> +		if (error)
> +			return (error);
>  		goto restart;
>  	}
>  dontblock:
> @@ -1202,21 +1186,12 @@ dontblock:
>  				break;
>  			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
>  			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
> -			if (dosolock) {
> +			if (sbwait(so, &so->so_rcv)) {
>  				sb_mtx_unlock(&so->so_rcv);
> -				error = sbwait(so, &so->so_rcv);
> -				if (error) {
> -					sbunlock(so, &so->so_rcv);
> +				if (dosolock)
>  					sounlock_shared(so);
> -					return (0);
> -				}
> -				sb_mtx_lock(&so->so_rcv);
> -			} else {
> -				if (sbwait_locked(so, &so->so_rcv)) {
> -					sb_mtx_unlock(&so->so_rcv);
> -					sbunlock(so, &so->so_rcv);
> -					return (0);
> -				}
> +				sbunlock(&so->so_rcv);
> +				return (0);
>  			}
>  			if ((m = so->so_rcv.sb_mb) != NULL)
>  				nextrecord = m->m_nextpkt;
> @@ -1258,7 +1233,7 @@ dontblock:
>  	    (flags & MSG_EOR) == 0 &&
>  	    (so->so_rcv.sb_state & SS_CANTRCVMORE) == 0) {
>  		sb_mtx_unlock(&so->so_rcv);
> -		sbunlock(so, &so->so_rcv);
> +		sbunlock(&so->so_rcv);
>  		goto restart;
>  	}
>  
> @@ -1269,10 +1244,9 @@ dontblock:
>  		*flagsp |= flags;
>  release:
>  	sb_mtx_unlock(&so->so_rcv);
> -	sbunlock(so, &so->so_rcv);
> -out:
>  	if (dosolock)
>  		sounlock_shared(so);
> +	sbunlock(&so->so_rcv);
>  	return (error);
>  }
>  
> @@ -1302,48 +1276,33 @@ soshutdown(struct socket *so, int how)
>  }
>  
>  void
> -sorflush_locked(struct socket *so)
> +sorflush(struct socket *so)
>  {
>  	struct sockbuf *sb = &so->so_rcv;
>  	struct mbuf *m;
>  	const struct protosw *pr = so->so_proto;
>  	int error;
>  
> -	if ((sb->sb_flags & SB_MTXLOCK) == 0)
> -		soassertlocked(so);
> -
> -	error = sblock(so, sb, SBL_WAIT | SBL_NOINTR);
> +	error = sblock(sb, SBL_WAIT | SBL_NOINTR);
>  	/* with SBL_WAIT and SLB_NOINTR sblock() must not fail */
>  	KASSERT(error == 0);
>  
> -	if (sb->sb_flags & SB_MTXLOCK)
> -		solock(so);
> +	solock_shared(so);
>  	socantrcvmore(so);
> -	if (sb->sb_flags & SB_MTXLOCK)
> -		sounlock(so);
> -
>  	mtx_enter(&sb->sb_mtx);
>  	m = sb->sb_mb;
>  	memset(&sb->sb_startzero, 0,
>  	     (caddr_t)&sb->sb_endzero - (caddr_t)&sb->sb_startzero);
>  	sb->sb_timeo_nsecs = INFSLP;
>  	mtx_leave(&sb->sb_mtx);
> -	sbunlock(so, sb);
> +	sounlock_shared(so);
> +	sbunlock(sb);
> +
>  	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
>  		(*pr->pr_domain->dom_dispose)(m);
>  	m_purge(m);
>  }
>  
> -void
> -sorflush(struct socket *so)
> -{
> -	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
> -		solock_shared(so);
> -	sorflush_locked(so);
> -	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
> -		sounlock_shared(so);
> -}
> -
>  #ifdef SOCKET_SPLICE
>  
>  #define so_splicelen	so_sp->ssp_len
> @@ -1355,7 +1314,7 @@ sorflush(struct socket *so)
>  int
>  sosplice(struct socket *so, int fd, off_t max, struct timeval *tv)
>  {
> -	struct file	*fp = NULL;
> +	struct file	*fp;
>  	struct socket	*sosp;
>  	struct taskq	*tq;
>  	int		 error = 0;
> @@ -1367,6 +1326,29 @@ sosplice(struct socket *so, int fd, off_
>  	if (tv && (tv->tv_sec < 0 || !timerisvalid(tv)))
>  		return (EINVAL);
>  
> +	/* If no fd is given, unsplice by removing existing link. */
> +	if (fd < 0) {
> +		if ((error = sblock(&so->so_rcv, SBL_WAIT)) != 0)
> +			return (error);
> +		solock(so);
> +		if (so->so_options & SO_ACCEPTCONN) {
> +			error = EOPNOTSUPP;
> +			goto out;
> +		}
> +		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
> +		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
> +			error = ENOTCONN;
> +			goto out;
> +		}
> +
> +		if (so->so_sp && so->so_sp->ssp_socket)
> +			sounsplice(so, so->so_sp->ssp_socket, 0);
> + out:
> +		sounlock(so);
> +		sbunlock(&so->so_rcv);
> +		return (error);
> +	}
> +
>  	if (sosplice_taskq == NULL) {
>  		rw_enter_write(&sosplice_lock);
>  		if (sosplice_taskq == NULL) {
> @@ -1386,65 +1368,47 @@ sosplice(struct socket *so, int fd, off_
>  		membar_consumer();
>  	}
>  
> -	if (so->so_rcv.sb_flags & SB_MTXLOCK) {
> -		if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0)
> -			return (error);
> -		solock(so);
> -	} else {
> -		solock(so);
> -		if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0) {
> -			sounlock(so);
> -			return (error);
> -		}
> -	}
> -
> -	if (so->so_options & SO_ACCEPTCONN) {
> -		error = EOPNOTSUPP;
> -		goto out;
> -	}
> -	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
> -	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
> -		error = ENOTCONN;
> -		goto out;
> -	}
> -	if (so->so_sp == NULL)
> -		so->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
> -
> -	/* If no fd is given, unsplice by removing existing link. */
> -	if (fd < 0) {
> -		if (so->so_sp->ssp_socket)
> -			sounsplice(so, so->so_sp->ssp_socket, 0);
> -		goto out;
> -	}
> -
>  	/* Find sosp, the drain socket where data will be spliced into. */
>  	if ((error = getsock(curproc, fd, &fp)) != 0)
> -		goto out;
> +		return (error);
>  	sosp = fp->f_data;
> +
>  	if (sosp->so_proto->pr_usrreqs->pru_send !=
>  	    so->so_proto->pr_usrreqs->pru_send) {
>  		error = EPROTONOSUPPORT;
> -		goto out;
> +		goto frele;
>  	}
> -	if (sosp->so_sp == NULL)
> -		sosp->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
>  
> -	if ((error = sblock(sosp, &sosp->so_snd, SBL_WAIT)) != 0) {
> -		goto out;
> +	if ((error = sblock(&so->so_rcv, SBL_WAIT)) != 0)
> +		goto frele;
> +	if ((error = sblock(&sosp->so_snd, SBL_WAIT)) != 0) {
> +		sbunlock(&so->so_rcv);
> +		goto frele;
>  	}
> +	solock(so);
>  
> -	if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) {
> -		error = EBUSY;
> +	if ((so->so_options & SO_ACCEPTCONN) ||
> +	    (sosp->so_options & SO_ACCEPTCONN)) {
> +		error = EOPNOTSUPP;
>  		goto release;
>  	}
> -	if (sosp->so_options & SO_ACCEPTCONN) {
> -		error = EOPNOTSUPP;
> +	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
> +	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
> +		error = ENOTCONN;
>  		goto release;
>  	}
>  	if ((sosp->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0) {
>  		error = ENOTCONN;
>  		goto release;
>  	}
> +	if (so->so_sp == NULL)
> +		so->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
> +	if (sosp->so_sp == NULL)
> +		sosp->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
> +	if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) {
> +		error = EBUSY;
> +		goto release;
> +	}
>  
>  	/* Splice so and sosp together. */
>  	mtx_enter(&so->so_rcv.sb_mtx);
> @@ -1472,18 +1436,11 @@ sosplice(struct socket *so, int fd, off_
>  	}
>  
>   release:
> -	sbunlock(sosp, &sosp->so_snd);
> - out:
> -	if (so->so_rcv.sb_flags & SB_MTXLOCK) {
> -		sounlock(so);
> -		sbunlock(so, &so->so_rcv);
> -	} else {
> -		sbunlock(so, &so->so_rcv);
> -		sounlock(so);
> -	}
> -
> -	if (fp)
> -		FRELE(fp, curproc);
> +	sounlock(so);
> +	sbunlock(&sosp->so_snd);
> +	sbunlock(&so->so_rcv);
> + frele:
> +	FRELE(fp, curproc);
>  
>  	return (error);
>  }
> Index: sys/kern/uipc_socket2.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/uipc_socket2.c,v
> diff -u -p -r1.154 uipc_socket2.c
> --- sys/kern/uipc_socket2.c	7 May 2024 15:54:23 -0000	1.154
> +++ sys/kern/uipc_socket2.c	7 May 2024 19:27:19 -0000
> @@ -512,23 +512,19 @@ sbmtxassertlocked(struct socket *so, str
>   * Wait for data to arrive at/drain from a socket buffer.
>   */
>  int
> -sbwait_locked(struct socket *so, struct sockbuf *sb)
> -{
> -	int prio = (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH;
> -
> -	MUTEX_ASSERT_LOCKED(&sb->sb_mtx);
> -
> -	sb->sb_flags |= SB_WAIT;
> -	return msleep_nsec(&sb->sb_cc, &sb->sb_mtx, prio, "sbwait",
> -	    sb->sb_timeo_nsecs);
> -}
> -
> -int
>  sbwait(struct socket *so, struct sockbuf *sb)
>  {
>  	uint64_t timeo_nsecs;
>  	int prio = (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH;
>  
> +	if (sb->sb_flags & SB_MTXLOCK) {
> +		MUTEX_ASSERT_LOCKED(&sb->sb_mtx);
> +
> +		sb->sb_flags |= SB_WAIT;
> +		return msleep_nsec(&sb->sb_cc, &sb->sb_mtx, prio, "sbwait",
> +		    sb->sb_timeo_nsecs);
> +	}
> +
>  	soassertlocked(so);
>  
>  	mtx_enter(&sb->sb_mtx);
> @@ -540,81 +536,26 @@ sbwait(struct socket *so, struct sockbuf
>  }
>  
>  int
> -sblock(struct socket *so, struct sockbuf *sb, int flags)
> +sblock(struct sockbuf *sb, int flags)
>  {
> -	int error = 0, prio = PSOCK;
> -
> -	if (sb->sb_flags & SB_MTXLOCK) {
> -		int rwflags = RW_WRITE;
> -
> -		if (!(flags & SBL_NOINTR || sb->sb_flags & SB_NOINTR))
> -			rwflags |= RW_INTR;
> -		if (!(flags & SBL_WAIT))
> -			rwflags |= RW_NOSLEEP;
> -
> -		error = rw_enter(&sb->sb_lock, rwflags);
> -		if (error == EBUSY)
> -			error = EWOULDBLOCK;
> -		return error;
> -	}
> +	int rwflags = RW_WRITE, error;
>  
> -	soassertlocked(so);
> -
> -	mtx_enter(&sb->sb_mtx);
> -	if ((sb->sb_flags & SB_LOCK) == 0) {
> -		sb->sb_flags |= SB_LOCK;
> -		goto out;
> -	}
> -	if ((flags & SBL_WAIT) == 0) {
> -		error = EWOULDBLOCK;
> -		goto out;
> -	}
>  	if (!(flags & SBL_NOINTR || sb->sb_flags & SB_NOINTR))
> -		prio |= PCATCH;
> -
> -	while (sb->sb_flags & SB_LOCK) {
> -		sb->sb_flags |= SB_WANT;
> -		mtx_leave(&sb->sb_mtx);
> -		error = sosleep_nsec(so, &sb->sb_flags, prio, "sblock", INFSLP);
> -		if (error)
> -			return (error);
> -		mtx_enter(&sb->sb_mtx);
> -	}
> -	sb->sb_flags |= SB_LOCK;
> -out:
> -	mtx_leave(&sb->sb_mtx);
> -
> -	return (error);
> -}
> -
> -void
> -sbunlock_locked(struct socket *so, struct sockbuf *sb)
> -{
> -	if (sb->sb_flags & SB_MTXLOCK) {
> -		rw_exit(&sb->sb_lock);
> -		return;
> -	}
> +		rwflags |= RW_INTR;
> +	if (!(flags & SBL_WAIT))
> +		rwflags |= RW_NOSLEEP;
>  
> -	MUTEX_ASSERT_LOCKED(&sb->sb_mtx);
> +	error = rw_enter(&sb->sb_lock, rwflags);
> +	if (error == EBUSY)
> +		error = EWOULDBLOCK;
>  
> -	sb->sb_flags &= ~SB_LOCK;
> -	if (sb->sb_flags & SB_WANT) {
> -		sb->sb_flags &= ~SB_WANT;
> -		wakeup(&sb->sb_flags);
> -	}
> +	return error;
>  }
>  
>  void
> -sbunlock(struct socket *so, struct sockbuf *sb)
> +sbunlock(struct sockbuf *sb)
>  {
> -	if (sb->sb_flags & SB_MTXLOCK) {
> -		rw_exit(&sb->sb_lock);
> -		return;
> -	}
> -
> -	mtx_enter(&sb->sb_mtx);
> -	sbunlock_locked(so, sb);
> -	mtx_leave(&sb->sb_mtx);
> +	rw_exit(&sb->sb_lock);
>  }
>  
>  /*
> @@ -1128,7 +1069,7 @@ void
>  sbflush(struct socket *so, struct sockbuf *sb)
>  {
>  	KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
> -	KASSERT((sb->sb_flags & SB_LOCK) == 0);
> +	rw_assert_unlocked(&sb->sb_lock);
>  
>  	while (sb->sb_mbcnt)
>  		sbdrop(so, sb, (int)sb->sb_cc);
> Index: sys/sys/socketvar.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/socketvar.h,v
> diff -u -p -r1.130 socketvar.h
> --- sys/sys/socketvar.h	3 May 2024 17:43:09 -0000	1.130
> +++ sys/sys/socketvar.h	7 May 2024 19:27:20 -0000
> @@ -128,13 +128,11 @@ struct socket {
>  		struct klist sb_klist;	/* process selecting read/write */
>  	} so_rcv, so_snd;
>  #define SB_MAX		(2*1024*1024)	/* default for max chars in sockbuf */
> -#define SB_LOCK		0x0001		/* lock on data queue */
> -#define SB_WANT		0x0002		/* someone is waiting to lock */
> -#define SB_WAIT		0x0004		/* someone is waiting for data/space */
> -#define SB_ASYNC	0x0010		/* ASYNC I/O, need signals */
> -#define SB_SPLICE	0x0020		/* buffer is splice source or drain */
> -#define SB_NOINTR	0x0040		/* operations not interruptible */
> -#define SB_MTXLOCK	0x0080		/* sblock() doesn't need solock() */
> +#define SB_WAIT		0x0001		/* someone is waiting for data/space */
> +#define SB_ASYNC	0x0002		/* ASYNC I/O, need signals */
> +#define SB_SPLICE	0x0004		/* buffer is splice source or drain */
> +#define SB_NOINTR	0x0008		/* operations not interruptible */
> +#define SB_MTXLOCK	0x0010		/* sblock() doesn't need solock() */
>  
>  	void	(*so_upcall)(struct socket *so, caddr_t arg, int waitf);
>  	caddr_t	so_upcallarg;		/* Arg for above */
> @@ -315,11 +313,10 @@ sbfree(struct socket *so, struct sockbuf
>   * sleep is interruptible. Returns error without lock if
>   * sleep is interrupted.
>   */
> -int sblock(struct socket *, struct sockbuf *, int);
> +int sblock(struct sockbuf *, int);
>  
>  /* release lock on sockbuf sb */
> -void sbunlock(struct socket *, struct sockbuf *);
> -void sbunlock_locked(struct socket *, struct sockbuf *);
> +void sbunlock(struct sockbuf *);
>  
>  #define	SB_EMPTY_FIXUP(sb) do {						\
>  	if ((sb)->sb_mb == NULL) {					\
> @@ -367,7 +364,6 @@ int	sbcheckreserve(u_long, u_long);
>  int	sbchecklowmem(void);
>  int	sbreserve(struct socket *, struct sockbuf *, u_long);
>  int	sbwait(struct socket *, struct sockbuf *);
> -int	sbwait_locked(struct socket *, struct sockbuf *);
>  void	soinit(void);
>  void	soabort(struct socket *);
>  int	soaccept(struct socket *, struct mbuf *);