Index | Thread | Search

From:
Vitaliy Makkoveev <mvs@openbsd.org>
Subject:
Turn sblock() to `sb_lock' rwlock(9) wrapper for all sockets
To:
tech@openbsd.org
Date:
Sat, 4 May 2024 02:57:00 +0300

Download raw body.

Thread
sblock() is fine-grained lock which serializes socket send and receive
routines on `so_rcv' or `so_snd' buffers. There is no big problem to
wait netlock while holding sblock().

So unify sblock() behaviour to all sockets. Now it should be always
taken before solock() in all involved paths as sosend(), soreceive(),
sorflush() and sosplice() as already done for "unlocked" socket buffers.

This unification removes a lot of temporary "sb_flags & SB_MTXLOCK" code
from sockets layer. This unification makes straight "solock()" and
"sblock()" lock order, no more solock() -> sblock() -> sounlock() ->
solock() -> sbunlock() -> sounlock() chains in sosend() and soreceive()
paths. This unification brings witness(4) support for sblock(), include
NFS involved sockets, which is useful.

Since the witness(4) support was introduced to sblock() with this diff,
some new witness reports appeared. That's normal, it's the first time
they were exposed. In addition to [1] "lock order reversal in soreceive
and NFS" thread, I could say that `so_snd' also produces witness
reports during regress/sys/ffs/nfs run.

witness: lock order reversal:
 1st 0xfffffd8430fe37d0 vmmaplk (&map->lock)
 2nd 0xffff80001ead3b00 sbufrcv (&so->so_rcv.sb_lock)
lock order [1] vmmaplk (&map->lock) -> [2] sbufrcv (&so->so_rcv.sb_lock)
lock order data 0xffffffff8217bc37 -> 0xffffffff8221a474 is missing
lock order [2] sbufrcv (&so->so_rcv.sb_lock) -> [1] vmmaplk (&map->lock)
#0  rw_enter_read+0x50
#1  uvmfault_lookup+0x8a
#2  uvm_fault_check+0x36
#3  uvm_fault+0xfb
#4  kpageflttrap+0x158
#5  kerntrap+0x94
#6  alltraps_kern_meltdown+0x7b
#7  copyout+0x57
#8  soreceive+0x94a
#9  recvit+0x1fd
#10 sys_recvfrom+0xa4
#11 syscall+0x588
#12 Xsyscall+0x128
witness: lock order reversal:
 1st 0xfffffd8430fe37d0 vmmaplk (&map->lock)
 2nd 0xffff80001ead3bd8 sbufsnd (&so->so_snd.sb_lock)
lock order [1] vmmaplk (&map->lock) -> [2] sbufsnd (&so->so_snd.sb_lock)
lock order data 0xffffffff8217bc37 -> 0xffffffff821e61ab is missing
lock order [2] sbufsnd (&so->so_snd.sb_lock) -> [1] vmmaplk (&map->lock)
#0  rw_enter_read+0x50
#1  uvmfault_lookup+0x8a
#2  uvm_fault_check+0x36
#3  uvm_fault+0xfb
#4  kpageflttrap+0x158
#5  kerntrap+0x94
#6  alltraps_kern_meltdown+0x7b
#7  _copyin+0x57
#8  m_getuio+0x168
#9  sosend+0x302
#10 dofilewritev+0x14e
#11 sys_write+0x55
#12 syscall+0x588
#13 Xsyscall+0x128
witness: lock order reversal:
 1st 0xffff8000007945f8 dklk (&diskp->dk_lock)
 2nd 0xffff80001ead3b00 sbufrcv (&so->so_rcv.sb_lock)
witness: incomplete path, depth 4
witness: lock order reversal:
 1st 0xffff8000007945f8 dklk (&diskp->dk_lock)
 2nd 0xffff80001ead3bd8 sbufsnd (&so->so_snd.sb_lock)
witness: incomplete path, depth 4

1. https://marc.info/?t=171379319500001&r=1&w=2

Index: sys/kern/uipc_socket.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket.c,v
diff -u -p -r1.333 uipc_socket.c
--- sys/kern/uipc_socket.c	3 May 2024 17:43:09 -0000	1.333
+++ sys/kern/uipc_socket.c	3 May 2024 23:25:35 -0000
@@ -66,7 +66,6 @@ void	soreaper(void *);
 void	soput(void *);
 int	somove(struct socket *, int);
 void	sorflush(struct socket *);
-void	sorflush_locked(struct socket *);
 
 void	filt_sordetach(struct knote *kn);
 int	filt_soread(struct knote *kn, long hint);
@@ -606,11 +605,11 @@ sosend(struct socket *so, struct mbuf *a
 
 #define	snderr(errno)	{ error = errno; goto release; }
 
-	if (dosolock)
-		solock_shared(so);
 restart:
 	if ((error = sblock(so, &so->so_snd, SBLOCKWAIT(flags))) != 0)
 		goto out;
+	if (dosolock)
+		solock_shared(so);
 	sb_mtx_lock(&so->so_snd);
 	so->so_snd.sb_state |= SS_ISSENDING;
 	do {
@@ -644,14 +643,11 @@ restart:
 			if (flags & MSG_DONTWAIT)
 				snderr(EWOULDBLOCK);
 			sbunlock(so, &so->so_snd);
-
-			if (so->so_snd.sb_flags & SB_MTXLOCK)
-				error = sbwait_locked(so, &so->so_snd);
-			else
-				error = sbwait(so, &so->so_snd);
-
+			error = sbwait(so, &so->so_snd);
 			so->so_snd.sb_state &= ~SS_ISSENDING;
 			sb_mtx_unlock(&so->so_snd);
+			if (dosolock)
+				sounlock_shared(so);
 			if (error)
 				goto out;
 			goto restart;
@@ -705,10 +701,10 @@ restart:
 release:
 	so->so_snd.sb_state &= ~SS_ISSENDING;
 	sb_mtx_unlock(&so->so_snd);
-	sbunlock(so, &so->so_snd);
-out:
 	if (dosolock)
 		sounlock_shared(so);
+	sbunlock(so, &so->so_snd);
+out:
 	m_freem(top);
 	m_freem(control);
 	return (error);
@@ -875,11 +871,11 @@ bad:
 	if (mp)
 		*mp = NULL;
 
-	if (dosolock)
-		solock_shared(so);
 restart:
 	if ((error = sblock(so, &so->so_rcv, SBLOCKWAIT(flags))) != 0)
-		goto out;
+		return (error);
+	if (dosolock)
+		solock_shared(so);
 	sb_mtx_lock(&so->so_rcv);
 
 	m = so->so_rcv.sb_mb;
@@ -944,25 +940,13 @@ restart:
 		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
 		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
 
-		if (so->so_rcv.sb_flags & SB_MTXLOCK) {
-			sbunlock_locked(so, &so->so_rcv);
-			if (dosolock)
-				sounlock_shared(so);
-			error = sbwait_locked(so, &so->so_rcv);
-			sb_mtx_unlock(&so->so_rcv);
-			if (error)
-				return (error);
-			if (dosolock)
-				solock_shared(so);
-		} else {
-			sb_mtx_unlock(&so->so_rcv);
-			sbunlock(so, &so->so_rcv);
-			error = sbwait(so, &so->so_rcv);
-			if (error) {
-				sounlock_shared(so);
-				return (error);
-			}
-		}
+		sbunlock(so, &so->so_rcv);
+		error = sbwait(so, &so->so_rcv);
+		sb_mtx_unlock(&so->so_rcv);
+		if (dosolock)
+			sounlock_shared(so);
+		if (error)
+			return (error);
 		goto restart;
 	}
 dontblock:
@@ -1202,21 +1186,12 @@ dontblock:
 				break;
 			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
 			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
-			if (dosolock) {
+			if (sbwait(so, &so->so_rcv)) {
 				sb_mtx_unlock(&so->so_rcv);
-				error = sbwait(so, &so->so_rcv);
-				if (error) {
-					sbunlock(so, &so->so_rcv);
+				if (dosolock)
 					sounlock_shared(so);
-					return (0);
-				}
-				sb_mtx_lock(&so->so_rcv);
-			} else {
-				if (sbwait_locked(so, &so->so_rcv)) {
-					sb_mtx_unlock(&so->so_rcv);
-					sbunlock(so, &so->so_rcv);
-					return (0);
-				}
+				sbunlock(so, &so->so_rcv);
+				return (0);
 			}
 			if ((m = so->so_rcv.sb_mb) != NULL)
 				nextrecord = m->m_nextpkt;
@@ -1269,10 +1244,9 @@ dontblock:
 		*flagsp |= flags;
 release:
 	sb_mtx_unlock(&so->so_rcv);
-	sbunlock(so, &so->so_rcv);
-out:
 	if (dosolock)
 		sounlock_shared(so);
+	sbunlock(so, &so->so_rcv);
 	return (error);
 }
 
@@ -1302,48 +1276,33 @@ soshutdown(struct socket *so, int how)
 }
 
 void
-sorflush_locked(struct socket *so)
+sorflush(struct socket *so)
 {
 	struct sockbuf *sb = &so->so_rcv;
 	struct mbuf *m;
 	const struct protosw *pr = so->so_proto;
 	int error;
 
-	if ((sb->sb_flags & SB_MTXLOCK) == 0)
-		soassertlocked(so);
-
 	error = sblock(so, sb, SBL_WAIT | SBL_NOINTR);
 	/* with SBL_WAIT and SLB_NOINTR sblock() must not fail */
 	KASSERT(error == 0);
 
-	if (sb->sb_flags & SB_MTXLOCK)
-		solock(so);
+	solock(so);
 	socantrcvmore(so);
-	if (sb->sb_flags & SB_MTXLOCK)
-		sounlock(so);
-
 	mtx_enter(&sb->sb_mtx);
 	m = sb->sb_mb;
 	memset(&sb->sb_startzero, 0,
 	     (caddr_t)&sb->sb_endzero - (caddr_t)&sb->sb_startzero);
 	sb->sb_timeo_nsecs = INFSLP;
 	mtx_leave(&sb->sb_mtx);
+	sounlock(so);
 	sbunlock(so, sb);
+
 	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
 		(*pr->pr_domain->dom_dispose)(m);
 	m_purge(m);
 }
 
-void
-sorflush(struct socket *so)
-{
-	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
-		solock_shared(so);
-	sorflush_locked(so);
-	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
-		sounlock_shared(so);
-}
-
 #ifdef SOCKET_SPLICE
 
 #define so_splicelen	so_sp->ssp_len
@@ -1355,7 +1314,7 @@ sorflush(struct socket *so)
 int
 sosplice(struct socket *so, int fd, off_t max, struct timeval *tv)
 {
-	struct file	*fp = NULL;
+	struct file	*fp;
 	struct socket	*sosp;
 	struct taskq	*tq;
 	int		 error = 0;
@@ -1367,6 +1326,29 @@ sosplice(struct socket *so, int fd, off_
 	if (tv && (tv->tv_sec < 0 || !timerisvalid(tv)))
 		return (EINVAL);
 
+	/* If no fd is given, unsplice by removing existing link. */
+	if (fd < 0) {
+		if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0)
+			return (error);
+		solock(so);
+		if (so->so_options & SO_ACCEPTCONN) {
+			error = EOPNOTSUPP;
+			goto out;
+		}
+		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
+		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
+			error = ENOTCONN;
+			goto out;
+		}
+
+		if (so->so_sp && so->so_sp->ssp_socket)
+			sounsplice(so, so->so_sp->ssp_socket, 0);
+ out:
+		sounlock(so);
+		sbunlock(so, &so->so_rcv);
+		return (error);
+	}
+
 	if (sosplice_taskq == NULL) {
 		rw_enter_write(&sosplice_lock);
 		if (sosplice_taskq == NULL) {
@@ -1386,65 +1368,47 @@ sosplice(struct socket *so, int fd, off_
 		membar_consumer();
 	}
 
-	if (so->so_rcv.sb_flags & SB_MTXLOCK) {
-		if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0)
-			return (error);
-		solock(so);
-	} else {
-		solock(so);
-		if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0) {
-			sounlock(so);
-			return (error);
-		}
-	}
-
-	if (so->so_options & SO_ACCEPTCONN) {
-		error = EOPNOTSUPP;
-		goto out;
-	}
-	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
-	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
-		error = ENOTCONN;
-		goto out;
-	}
-	if (so->so_sp == NULL)
-		so->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
-
-	/* If no fd is given, unsplice by removing existing link. */
-	if (fd < 0) {
-		if (so->so_sp->ssp_socket)
-			sounsplice(so, so->so_sp->ssp_socket, 0);
-		goto out;
-	}
-
 	/* Find sosp, the drain socket where data will be spliced into. */
 	if ((error = getsock(curproc, fd, &fp)) != 0)
-		goto out;
+		return (error);
 	sosp = fp->f_data;
+
 	if (sosp->so_proto->pr_usrreqs->pru_send !=
 	    so->so_proto->pr_usrreqs->pru_send) {
 		error = EPROTONOSUPPORT;
-		goto out;
+		goto frele;
 	}
-	if (sosp->so_sp == NULL)
-		sosp->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
 
+	if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0)
+		goto frele;
 	if ((error = sblock(sosp, &sosp->so_snd, SBL_WAIT)) != 0) {
-		goto out;
+		sbunlock(so, &so->so_rcv);
+		goto frele;
 	}
+	solock(so);
 
-	if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) {
-		error = EBUSY;
+	if ((so->so_options & SO_ACCEPTCONN) ||
+	    (sosp->so_options & SO_ACCEPTCONN)) {
+		error = EOPNOTSUPP;
 		goto release;
 	}
-	if (sosp->so_options & SO_ACCEPTCONN) {
-		error = EOPNOTSUPP;
+	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
+	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
+		error = ENOTCONN;
 		goto release;
 	}
 	if ((sosp->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0) {
 		error = ENOTCONN;
 		goto release;
 	}
+	if (so->so_sp == NULL)
+		so->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
+	if (sosp->so_sp == NULL)
+		sosp->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
+	if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) {
+		error = EBUSY;
+		goto release;
+	}
 
 	/* Splice so and sosp together. */
 	mtx_enter(&so->so_rcv.sb_mtx);
@@ -1472,18 +1436,11 @@ sosplice(struct socket *so, int fd, off_
 	}
 
  release:
+	sounlock(so);
 	sbunlock(sosp, &sosp->so_snd);
- out:
-	if (so->so_rcv.sb_flags & SB_MTXLOCK) {
-		sounlock(so);
-		sbunlock(so, &so->so_rcv);
-	} else {
-		sbunlock(so, &so->so_rcv);
-		sounlock(so);
-	}
-
-	if (fp)
-		FRELE(fp, curproc);
+	sbunlock(so, &so->so_rcv);
+ frele:
+	FRELE(fp, curproc);
 
 	return (error);
 }
Index: sys/kern/uipc_socket2.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket2.c,v
diff -u -p -r1.153 uipc_socket2.c
--- sys/kern/uipc_socket2.c	3 May 2024 17:43:09 -0000	1.153
+++ sys/kern/uipc_socket2.c	3 May 2024 23:25:36 -0000
@@ -512,23 +512,19 @@ sbmtxassertlocked(struct socket *so, str
  * Wait for data to arrive at/drain from a socket buffer.
  */
 int
-sbwait_locked(struct socket *so, struct sockbuf *sb)
-{
-	int prio = (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH;
-
-	MUTEX_ASSERT_LOCKED(&sb->sb_mtx);
-
-	sb->sb_flags |= SB_WAIT;
-	return msleep_nsec(&sb->sb_cc, &sb->sb_mtx, prio, "sbwait",
-	    sb->sb_timeo_nsecs);
-}
-
-int
 sbwait(struct socket *so, struct sockbuf *sb)
 {
 	uint64_t timeo_nsecs;
 	int prio = (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH;
 
+	if (sb->sb_flags & SB_MTXLOCK) {
+		MUTEX_ASSERT_LOCKED(&sb->sb_mtx);
+
+		sb->sb_flags |= SB_WAIT;
+		return msleep_nsec(&sb->sb_cc, &sb->sb_mtx, prio, "sbwait",
+		    sb->sb_timeo_nsecs);
+	}
+
 	soassertlocked(so);
 
 	mtx_enter(&sb->sb_mtx);
@@ -542,76 +538,20 @@ sbwait(struct socket *so, struct sockbuf
 int
 sblock(struct socket *so, struct sockbuf *sb, int flags)
 {
-	int error = 0, prio = PSOCK;
-
-	if (sb->sb_flags & SB_MTXLOCK) {
-		int rwflags = RW_WRITE;
-
-		if (!(flags & SBL_NOINTR || sb->sb_flags & SB_NOINTR))
-			rwflags |= RW_INTR;
-		if (!(flags & SBL_WAIT))
-			rwflags |= RW_NOSLEEP;
+	int rwflags = RW_WRITE;
 
-		return rw_enter(&sb->sb_lock, rwflags);
-	}
-
-	soassertlocked(so);
-
-	mtx_enter(&sb->sb_mtx);
-	if ((sb->sb_flags & SB_LOCK) == 0) {
-		sb->sb_flags |= SB_LOCK;
-		goto out;
-	}
-	if ((flags & SBL_WAIT) == 0) {
-		error = EWOULDBLOCK;
-		goto out;
-	}
 	if (!(flags & SBL_NOINTR || sb->sb_flags & SB_NOINTR))
-		prio |= PCATCH;
-
-	while (sb->sb_flags & SB_LOCK) {
-		sb->sb_flags |= SB_WANT;
-		mtx_leave(&sb->sb_mtx);
-		error = sosleep_nsec(so, &sb->sb_flags, prio, "sblock", INFSLP);
-		if (error)
-			return (error);
-		mtx_enter(&sb->sb_mtx);
-	}
-	sb->sb_flags |= SB_LOCK;
-out:
-	mtx_leave(&sb->sb_mtx);
-
-	return (error);
-}
-
-void
-sbunlock_locked(struct socket *so, struct sockbuf *sb)
-{
-	if (sb->sb_flags & SB_MTXLOCK) {
-		rw_exit(&sb->sb_lock);
-		return;
-	}
-
-	MUTEX_ASSERT_LOCKED(&sb->sb_mtx);
+		rwflags |= RW_INTR;
+	if (!(flags & SBL_WAIT))
+		rwflags |= RW_NOSLEEP;
 
-	sb->sb_flags &= ~SB_LOCK;
-	if (sb->sb_flags & SB_WANT) {
-		sb->sb_flags &= ~SB_WANT;
-		wakeup(&sb->sb_flags);
-	}
+	return rw_enter(&sb->sb_lock, rwflags);
 }
 
 void
 sbunlock(struct socket *so, struct sockbuf *sb)
 {
-	if (sb->sb_flags & SB_MTXLOCK) {
-		rw_exit(&sb->sb_lock);
-		return;
-	}
-
-	mtx_enter(&sb->sb_mtx);
-	sbunlock_locked(so, sb);
-	mtx_leave(&sb->sb_mtx);
+	rw_exit(&sb->sb_lock);
 }
 
 /*
@@ -1125,7 +1065,7 @@ void
 sbflush(struct socket *so, struct sockbuf *sb)
 {
 	KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
-	KASSERT((sb->sb_flags & SB_LOCK) == 0);
+	rw_assert_unlocked(&sb->sb_lock);
 
 	while (sb->sb_mbcnt)
 		sbdrop(so, sb, (int)sb->sb_cc);
Index: sys/sys/socketvar.h
===================================================================
RCS file: /cvs/src/sys/sys/socketvar.h,v
diff -u -p -r1.130 socketvar.h
--- sys/sys/socketvar.h	3 May 2024 17:43:09 -0000	1.130
+++ sys/sys/socketvar.h	3 May 2024 23:25:36 -0000
@@ -128,13 +128,11 @@ struct socket {
 		struct klist sb_klist;	/* process selecting read/write */
 	} so_rcv, so_snd;
 #define SB_MAX		(2*1024*1024)	/* default for max chars in sockbuf */
-#define SB_LOCK		0x0001		/* lock on data queue */
-#define SB_WANT		0x0002		/* someone is waiting to lock */
-#define SB_WAIT		0x0004		/* someone is waiting for data/space */
-#define SB_ASYNC	0x0010		/* ASYNC I/O, need signals */
-#define SB_SPLICE	0x0020		/* buffer is splice source or drain */
-#define SB_NOINTR	0x0040		/* operations not interruptible */
-#define SB_MTXLOCK	0x0080		/* sblock() doesn't need solock() */
+#define SB_WAIT		0x0001		/* someone is waiting for data/space */
+#define SB_ASYNC	0x0002		/* ASYNC I/O, need signals */
+#define SB_SPLICE	0x0004		/* buffer is splice source or drain */
+#define SB_NOINTR	0x0008		/* operations not interruptible */
+#define SB_MTXLOCK	0x0010		/* sblock() doesn't need solock() */
 
 	void	(*so_upcall)(struct socket *so, caddr_t arg, int waitf);
 	caddr_t	so_upcallarg;		/* Arg for above */
@@ -319,7 +317,6 @@ int sblock(struct socket *, struct sockb
 
 /* release lock on sockbuf sb */
 void sbunlock(struct socket *, struct sockbuf *);
-void sbunlock_locked(struct socket *, struct sockbuf *);
 
 #define	SB_EMPTY_FIXUP(sb) do {						\
 	if ((sb)->sb_mb == NULL) {					\
@@ -367,7 +364,6 @@ int	sbcheckreserve(u_long, u_long);
 int	sbchecklowmem(void);
 int	sbreserve(struct socket *, struct sockbuf *, u_long);
 int	sbwait(struct socket *, struct sockbuf *);
-int	sbwait_locked(struct socket *, struct sockbuf *);
 void	soinit(void);
 void	soabort(struct socket *);
 int	soaccept(struct socket *, struct mbuf *);