Index | Thread | Search

From:
Vitaliy Makkoveev <mvs@openbsd.org>
Subject:
Re: Turn sblock() to `sb_lock' rwlock(9) wrapper for all sockets
To:
tech@openbsd.org
Date:
Tue, 7 May 2024 23:23:17 +0300

Download raw body.

Thread
On Sat, May 04, 2024 at 02:57:00AM +0300, Vitaliy Makkoveev wrote:
> sblock() is fine-grained lock which serializes socket send and receive
> routines on `so_rcv' or `so_snd' buffers. There is no big problem to
> wait netlock while holding sblock().
> 
> So unify sblock() behaviour to all sockets. Now it should be always
> taken before solock() in all involved paths as sosend(), soreceive(),
> sorflush() and sosplice() as already done for "unlocked" socket buffers.
> 
> This unification removes a lot of temporary "sb_flags & SB_MTXLOCK" code
> from sockets layer. This unification makes straight "solock()" and
> "sblock()" lock order, no more solock() -> sblock() -> sounlock() ->
> solock() -> sbunlock() -> sounlock() chains in sosend() and soreceive()
> paths. This unification brings witness(4) support for sblock(), include
> NFS involved sockets, which is useful.
> 
> Since the witness(4) support was introduced to sblock() with this diff,
> some new witness reports appeared. That's normal, it's the first time
> they were exposed. In addition to [1] "lock order reversal in soreceive
> and NFS" thread, I could say that `so_snd' also produces witness
> reports during regress/sys/ffs/nfs run.
> 

Updated against last tree changes. In addition, I removed unused `so'
argument from sblock()/sbunlock().

Index: sys/kern/uipc_socket.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket.c,v
diff -u -p -r1.333 uipc_socket.c
--- sys/kern/uipc_socket.c	3 May 2024 17:43:09 -0000	1.333
+++ sys/kern/uipc_socket.c	7 May 2024 19:27:18 -0000
@@ -66,7 +66,6 @@ void	soreaper(void *);
 void	soput(void *);
 int	somove(struct socket *, int);
 void	sorflush(struct socket *);
-void	sorflush_locked(struct socket *);
 
 void	filt_sordetach(struct knote *kn);
 int	filt_soread(struct knote *kn, long hint);
@@ -606,11 +605,11 @@ sosend(struct socket *so, struct mbuf *a
 
 #define	snderr(errno)	{ error = errno; goto release; }
 
-	if (dosolock)
-		solock_shared(so);
 restart:
-	if ((error = sblock(so, &so->so_snd, SBLOCKWAIT(flags))) != 0)
+	if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0)
 		goto out;
+	if (dosolock)
+		solock_shared(so);
 	sb_mtx_lock(&so->so_snd);
 	so->so_snd.sb_state |= SS_ISSENDING;
 	do {
@@ -643,15 +642,12 @@ restart:
 		    (atomic || space < so->so_snd.sb_lowat))) {
 			if (flags & MSG_DONTWAIT)
 				snderr(EWOULDBLOCK);
-			sbunlock(so, &so->so_snd);
-
-			if (so->so_snd.sb_flags & SB_MTXLOCK)
-				error = sbwait_locked(so, &so->so_snd);
-			else
-				error = sbwait(so, &so->so_snd);
-
+			sbunlock(&so->so_snd);
+			error = sbwait(so, &so->so_snd);
 			so->so_snd.sb_state &= ~SS_ISSENDING;
 			sb_mtx_unlock(&so->so_snd);
+			if (dosolock)
+				sounlock_shared(so);
 			if (error)
 				goto out;
 			goto restart;
@@ -705,10 +701,10 @@ restart:
 release:
 	so->so_snd.sb_state &= ~SS_ISSENDING;
 	sb_mtx_unlock(&so->so_snd);
-	sbunlock(so, &so->so_snd);
-out:
 	if (dosolock)
 		sounlock_shared(so);
+	sbunlock(&so->so_snd);
+out:
 	m_freem(top);
 	m_freem(control);
 	return (error);
@@ -875,11 +871,11 @@ bad:
 	if (mp)
 		*mp = NULL;
 
+restart:
+	if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0)
+		return (error);
 	if (dosolock)
 		solock_shared(so);
-restart:
-	if ((error = sblock(so, &so->so_rcv, SBLOCKWAIT(flags))) != 0)
-		goto out;
 	sb_mtx_lock(&so->so_rcv);
 
 	m = so->so_rcv.sb_mb;
@@ -944,25 +940,13 @@ restart:
 		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
 		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
 
-		if (so->so_rcv.sb_flags & SB_MTXLOCK) {
-			sbunlock_locked(so, &so->so_rcv);
-			if (dosolock)
-				sounlock_shared(so);
-			error = sbwait_locked(so, &so->so_rcv);
-			sb_mtx_unlock(&so->so_rcv);
-			if (error)
-				return (error);
-			if (dosolock)
-				solock_shared(so);
-		} else {
-			sb_mtx_unlock(&so->so_rcv);
-			sbunlock(so, &so->so_rcv);
-			error = sbwait(so, &so->so_rcv);
-			if (error) {
-				sounlock_shared(so);
-				return (error);
-			}
-		}
+		sbunlock(&so->so_rcv);
+		error = sbwait(so, &so->so_rcv);
+		sb_mtx_unlock(&so->so_rcv);
+		if (dosolock)
+			sounlock_shared(so);
+		if (error)
+			return (error);
 		goto restart;
 	}
 dontblock:
@@ -1202,21 +1186,12 @@ dontblock:
 				break;
 			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
 			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
-			if (dosolock) {
+			if (sbwait(so, &so->so_rcv)) {
 				sb_mtx_unlock(&so->so_rcv);
-				error = sbwait(so, &so->so_rcv);
-				if (error) {
-					sbunlock(so, &so->so_rcv);
+				if (dosolock)
 					sounlock_shared(so);
-					return (0);
-				}
-				sb_mtx_lock(&so->so_rcv);
-			} else {
-				if (sbwait_locked(so, &so->so_rcv)) {
-					sb_mtx_unlock(&so->so_rcv);
-					sbunlock(so, &so->so_rcv);
-					return (0);
-				}
+				sbunlock(&so->so_rcv);
+				return (0);
 			}
 			if ((m = so->so_rcv.sb_mb) != NULL)
 				nextrecord = m->m_nextpkt;
@@ -1258,7 +1233,7 @@ dontblock:
 	    (flags & MSG_EOR) == 0 &&
 	    (so->so_rcv.sb_state & SS_CANTRCVMORE) == 0) {
 		sb_mtx_unlock(&so->so_rcv);
-		sbunlock(so, &so->so_rcv);
+		sbunlock(&so->so_rcv);
 		goto restart;
 	}
 
@@ -1269,10 +1244,9 @@ dontblock:
 		*flagsp |= flags;
 release:
 	sb_mtx_unlock(&so->so_rcv);
-	sbunlock(so, &so->so_rcv);
-out:
 	if (dosolock)
 		sounlock_shared(so);
+	sbunlock(&so->so_rcv);
 	return (error);
 }
 
@@ -1302,48 +1276,33 @@ soshutdown(struct socket *so, int how)
 }
 
 void
-sorflush_locked(struct socket *so)
+sorflush(struct socket *so)
 {
 	struct sockbuf *sb = &so->so_rcv;
 	struct mbuf *m;
 	const struct protosw *pr = so->so_proto;
 	int error;
 
-	if ((sb->sb_flags & SB_MTXLOCK) == 0)
-		soassertlocked(so);
-
-	error = sblock(so, sb, SBL_WAIT | SBL_NOINTR);
+	error = sblock(sb, SBL_WAIT | SBL_NOINTR);
 	/* with SBL_WAIT and SLB_NOINTR sblock() must not fail */
 	KASSERT(error == 0);
 
-	if (sb->sb_flags & SB_MTXLOCK)
-		solock(so);
+	solock_shared(so);
 	socantrcvmore(so);
-	if (sb->sb_flags & SB_MTXLOCK)
-		sounlock(so);
-
 	mtx_enter(&sb->sb_mtx);
 	m = sb->sb_mb;
 	memset(&sb->sb_startzero, 0,
 	     (caddr_t)&sb->sb_endzero - (caddr_t)&sb->sb_startzero);
 	sb->sb_timeo_nsecs = INFSLP;
 	mtx_leave(&sb->sb_mtx);
-	sbunlock(so, sb);
+	sounlock_shared(so);
+	sbunlock(sb);
+
 	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
 		(*pr->pr_domain->dom_dispose)(m);
 	m_purge(m);
 }
 
-void
-sorflush(struct socket *so)
-{
-	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
-		solock_shared(so);
-	sorflush_locked(so);
-	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
-		sounlock_shared(so);
-}
-
 #ifdef SOCKET_SPLICE
 
 #define so_splicelen	so_sp->ssp_len
@@ -1355,7 +1314,7 @@ sorflush(struct socket *so)
 int
 sosplice(struct socket *so, int fd, off_t max, struct timeval *tv)
 {
-	struct file	*fp = NULL;
+	struct file	*fp;
 	struct socket	*sosp;
 	struct taskq	*tq;
 	int		 error = 0;
@@ -1367,6 +1326,29 @@ sosplice(struct socket *so, int fd, off_
 	if (tv && (tv->tv_sec < 0 || !timerisvalid(tv)))
 		return (EINVAL);
 
+	/* If no fd is given, unsplice by removing existing link. */
+	if (fd < 0) {
+		if ((error = sblock(&so->so_rcv, SBL_WAIT)) != 0)
+			return (error);
+		solock(so);
+		if (so->so_options & SO_ACCEPTCONN) {
+			error = EOPNOTSUPP;
+			goto out;
+		}
+		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
+		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
+			error = ENOTCONN;
+			goto out;
+		}
+
+		if (so->so_sp && so->so_sp->ssp_socket)
+			sounsplice(so, so->so_sp->ssp_socket, 0);
+ out:
+		sounlock(so);
+		sbunlock(&so->so_rcv);
+		return (error);
+	}
+
 	if (sosplice_taskq == NULL) {
 		rw_enter_write(&sosplice_lock);
 		if (sosplice_taskq == NULL) {
@@ -1386,65 +1368,47 @@ sosplice(struct socket *so, int fd, off_
 		membar_consumer();
 	}
 
-	if (so->so_rcv.sb_flags & SB_MTXLOCK) {
-		if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0)
-			return (error);
-		solock(so);
-	} else {
-		solock(so);
-		if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0) {
-			sounlock(so);
-			return (error);
-		}
-	}
-
-	if (so->so_options & SO_ACCEPTCONN) {
-		error = EOPNOTSUPP;
-		goto out;
-	}
-	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
-	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
-		error = ENOTCONN;
-		goto out;
-	}
-	if (so->so_sp == NULL)
-		so->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
-
-	/* If no fd is given, unsplice by removing existing link. */
-	if (fd < 0) {
-		if (so->so_sp->ssp_socket)
-			sounsplice(so, so->so_sp->ssp_socket, 0);
-		goto out;
-	}
-
 	/* Find sosp, the drain socket where data will be spliced into. */
 	if ((error = getsock(curproc, fd, &fp)) != 0)
-		goto out;
+		return (error);
 	sosp = fp->f_data;
+
 	if (sosp->so_proto->pr_usrreqs->pru_send !=
 	    so->so_proto->pr_usrreqs->pru_send) {
 		error = EPROTONOSUPPORT;
-		goto out;
+		goto frele;
 	}
-	if (sosp->so_sp == NULL)
-		sosp->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
 
-	if ((error = sblock(sosp, &sosp->so_snd, SBL_WAIT)) != 0) {
-		goto out;
+	if ((error = sblock(&so->so_rcv, SBL_WAIT)) != 0)
+		goto frele;
+	if ((error = sblock(&sosp->so_snd, SBL_WAIT)) != 0) {
+		sbunlock(&so->so_rcv);
+		goto frele;
 	}
+	solock(so);
 
-	if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) {
-		error = EBUSY;
+	if ((so->so_options & SO_ACCEPTCONN) ||
+	    (sosp->so_options & SO_ACCEPTCONN)) {
+		error = EOPNOTSUPP;
 		goto release;
 	}
-	if (sosp->so_options & SO_ACCEPTCONN) {
-		error = EOPNOTSUPP;
+	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
+	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
+		error = ENOTCONN;
 		goto release;
 	}
 	if ((sosp->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0) {
 		error = ENOTCONN;
 		goto release;
 	}
+	if (so->so_sp == NULL)
+		so->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
+	if (sosp->so_sp == NULL)
+		sosp->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
+	if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) {
+		error = EBUSY;
+		goto release;
+	}
 
 	/* Splice so and sosp together. */
 	mtx_enter(&so->so_rcv.sb_mtx);
@@ -1472,18 +1436,11 @@ sosplice(struct socket *so, int fd, off_
 	}
 
  release:
-	sbunlock(sosp, &sosp->so_snd);
- out:
-	if (so->so_rcv.sb_flags & SB_MTXLOCK) {
-		sounlock(so);
-		sbunlock(so, &so->so_rcv);
-	} else {
-		sbunlock(so, &so->so_rcv);
-		sounlock(so);
-	}
-
-	if (fp)
-		FRELE(fp, curproc);
+	sounlock(so);
+	sbunlock(&sosp->so_snd);
+	sbunlock(&so->so_rcv);
+ frele:
+	FRELE(fp, curproc);
 
 	return (error);
 }
Index: sys/kern/uipc_socket2.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket2.c,v
diff -u -p -r1.154 uipc_socket2.c
--- sys/kern/uipc_socket2.c	7 May 2024 15:54:23 -0000	1.154
+++ sys/kern/uipc_socket2.c	7 May 2024 19:27:19 -0000
@@ -512,23 +512,19 @@ sbmtxassertlocked(struct socket *so, str
  * Wait for data to arrive at/drain from a socket buffer.
  */
 int
-sbwait_locked(struct socket *so, struct sockbuf *sb)
-{
-	int prio = (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH;
-
-	MUTEX_ASSERT_LOCKED(&sb->sb_mtx);
-
-	sb->sb_flags |= SB_WAIT;
-	return msleep_nsec(&sb->sb_cc, &sb->sb_mtx, prio, "sbwait",
-	    sb->sb_timeo_nsecs);
-}
-
-int
 sbwait(struct socket *so, struct sockbuf *sb)
 {
 	uint64_t timeo_nsecs;
 	int prio = (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH;
 
+	if (sb->sb_flags & SB_MTXLOCK) {
+		MUTEX_ASSERT_LOCKED(&sb->sb_mtx);
+
+		sb->sb_flags |= SB_WAIT;
+		return msleep_nsec(&sb->sb_cc, &sb->sb_mtx, prio, "sbwait",
+		    sb->sb_timeo_nsecs);
+	}
+
 	soassertlocked(so);
 
 	mtx_enter(&sb->sb_mtx);
@@ -540,81 +536,26 @@ sbwait(struct socket *so, struct sockbuf
 }
 
 int
-sblock(struct socket *so, struct sockbuf *sb, int flags)
+sblock(struct sockbuf *sb, int flags)
 {
-	int error = 0, prio = PSOCK;
-
-	if (sb->sb_flags & SB_MTXLOCK) {
-		int rwflags = RW_WRITE;
-
-		if (!(flags & SBL_NOINTR || sb->sb_flags & SB_NOINTR))
-			rwflags |= RW_INTR;
-		if (!(flags & SBL_WAIT))
-			rwflags |= RW_NOSLEEP;
-
-		error = rw_enter(&sb->sb_lock, rwflags);
-		if (error == EBUSY)
-			error = EWOULDBLOCK;
-		return error;
-	}
+	int rwflags = RW_WRITE, error;
 
-	soassertlocked(so);
-
-	mtx_enter(&sb->sb_mtx);
-	if ((sb->sb_flags & SB_LOCK) == 0) {
-		sb->sb_flags |= SB_LOCK;
-		goto out;
-	}
-	if ((flags & SBL_WAIT) == 0) {
-		error = EWOULDBLOCK;
-		goto out;
-	}
 	if (!(flags & SBL_NOINTR || sb->sb_flags & SB_NOINTR))
-		prio |= PCATCH;
-
-	while (sb->sb_flags & SB_LOCK) {
-		sb->sb_flags |= SB_WANT;
-		mtx_leave(&sb->sb_mtx);
-		error = sosleep_nsec(so, &sb->sb_flags, prio, "sblock", INFSLP);
-		if (error)
-			return (error);
-		mtx_enter(&sb->sb_mtx);
-	}
-	sb->sb_flags |= SB_LOCK;
-out:
-	mtx_leave(&sb->sb_mtx);
-
-	return (error);
-}
-
-void
-sbunlock_locked(struct socket *so, struct sockbuf *sb)
-{
-	if (sb->sb_flags & SB_MTXLOCK) {
-		rw_exit(&sb->sb_lock);
-		return;
-	}
+		rwflags |= RW_INTR;
+	if (!(flags & SBL_WAIT))
+		rwflags |= RW_NOSLEEP;
 
-	MUTEX_ASSERT_LOCKED(&sb->sb_mtx);
+	error = rw_enter(&sb->sb_lock, rwflags);
+	if (error == EBUSY)
+		error = EWOULDBLOCK;
 
-	sb->sb_flags &= ~SB_LOCK;
-	if (sb->sb_flags & SB_WANT) {
-		sb->sb_flags &= ~SB_WANT;
-		wakeup(&sb->sb_flags);
-	}
+	return error;
 }
 
 void
-sbunlock(struct socket *so, struct sockbuf *sb)
+sbunlock(struct sockbuf *sb)
 {
-	if (sb->sb_flags & SB_MTXLOCK) {
-		rw_exit(&sb->sb_lock);
-		return;
-	}
-
-	mtx_enter(&sb->sb_mtx);
-	sbunlock_locked(so, sb);
-	mtx_leave(&sb->sb_mtx);
+	rw_exit(&sb->sb_lock);
 }
 
 /*
@@ -1128,7 +1069,7 @@ void
 sbflush(struct socket *so, struct sockbuf *sb)
 {
 	KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
-	KASSERT((sb->sb_flags & SB_LOCK) == 0);
+	rw_assert_unlocked(&sb->sb_lock);
 
 	while (sb->sb_mbcnt)
 		sbdrop(so, sb, (int)sb->sb_cc);
Index: sys/sys/socketvar.h
===================================================================
RCS file: /cvs/src/sys/sys/socketvar.h,v
diff -u -p -r1.130 socketvar.h
--- sys/sys/socketvar.h	3 May 2024 17:43:09 -0000	1.130
+++ sys/sys/socketvar.h	7 May 2024 19:27:20 -0000
@@ -128,13 +128,11 @@ struct socket {
 		struct klist sb_klist;	/* process selecting read/write */
 	} so_rcv, so_snd;
 #define SB_MAX		(2*1024*1024)	/* default for max chars in sockbuf */
-#define SB_LOCK		0x0001		/* lock on data queue */
-#define SB_WANT		0x0002		/* someone is waiting to lock */
-#define SB_WAIT		0x0004		/* someone is waiting for data/space */
-#define SB_ASYNC	0x0010		/* ASYNC I/O, need signals */
-#define SB_SPLICE	0x0020		/* buffer is splice source or drain */
-#define SB_NOINTR	0x0040		/* operations not interruptible */
-#define SB_MTXLOCK	0x0080		/* sblock() doesn't need solock() */
+#define SB_WAIT		0x0001		/* someone is waiting for data/space */
+#define SB_ASYNC	0x0002		/* ASYNC I/O, need signals */
+#define SB_SPLICE	0x0004		/* buffer is splice source or drain */
+#define SB_NOINTR	0x0008		/* operations not interruptible */
+#define SB_MTXLOCK	0x0010		/* sblock() doesn't need solock() */
 
 	void	(*so_upcall)(struct socket *so, caddr_t arg, int waitf);
 	caddr_t	so_upcallarg;		/* Arg for above */
@@ -315,11 +313,10 @@ sbfree(struct socket *so, struct sockbuf
  * sleep is interruptible. Returns error without lock if
  * sleep is interrupted.
  */
-int sblock(struct socket *, struct sockbuf *, int);
+int sblock(struct sockbuf *, int);
 
 /* release lock on sockbuf sb */
-void sbunlock(struct socket *, struct sockbuf *);
-void sbunlock_locked(struct socket *, struct sockbuf *);
+void sbunlock(struct sockbuf *);
 
 #define	SB_EMPTY_FIXUP(sb) do {						\
 	if ((sb)->sb_mb == NULL) {					\
@@ -367,7 +364,6 @@ int	sbcheckreserve(u_long, u_long);
 int	sbchecklowmem(void);
 int	sbreserve(struct socket *, struct sockbuf *, u_long);
 int	sbwait(struct socket *, struct sockbuf *);
-int	sbwait_locked(struct socket *, struct sockbuf *);
 void	soinit(void);
 void	soabort(struct socket *);
 int	soaccept(struct socket *, struct mbuf *);