From: Vitaliy Makkoveev <mvs@openbsd.org>
Subject: Don't take solock() in soreceive() for SOCK_RAW inet sockets
To: tech@openbsd.org
Date: Sun, 31 Mar 2024 23:53:02 +0300

Reduce netlock pressure for inet sockets.

These sockets are not connection oriented: they don't call pru_rcvd(), they
can't be spliced, and they don't set `so_error'. There is nothing left to
protect with solock() in the soreceive(), filt_soread() and filt_soexcept()
paths.

The `so_rcv' buffer is protected by the `sb_mtx' mutex(9), but since that
mutex is released within the soreceive() path, sblock() is still required to
serialize concurrent soreceive() and sorflush() threads. The existing
sblock() is essentially an rwlock(9) implementation, so introduce an
`sb_lock' rwlock(9) in struct sockbuf and use it directly for that purpose.
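
For illustration only, here is a minimal userland sketch of that locking
discipline (hypothetical names; pthreads stand in for the kernel's rwlock(9)
and mutex(9)): an outer long-term lock plays the sblock()/`sb_lock' role and
serializes a receiver against a flusher, while an inner mutex plays the
`sb_mtx' role and is dropped around the potentially sleeping copy to
userland.

#include <pthread.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Hypothetical userland model of a receive buffer, not kernel code. */
struct rcvbuf {
	pthread_rwlock_t sb_lock;	/* sblock() role: held across the whole receive */
	pthread_mutex_t	 sb_mtx;	/* sb_mtx role: short-term buffer protection */
	char		 data[128];
	size_t		 len;
};

static struct rcvbuf rb = {
	.sb_lock = PTHREAD_RWLOCK_INITIALIZER,
	.sb_mtx = PTHREAD_MUTEX_INITIALIZER,
};

static void *
receiver(void *arg)
{
	char local[sizeof(rb.data)];
	size_t n;

	pthread_rwlock_wrlock(&rb.sb_lock);	/* sblock() */
	pthread_mutex_lock(&rb.sb_mtx);
	n = rb.len;
	memcpy(local, rb.data, n);
	rb.len = 0;
	pthread_mutex_unlock(&rb.sb_mtx);	/* dropped around the copy out ... */
	usleep(1000);				/* ... which may sleep (uiomove() role) */
	/* sb_lock is still held, so a concurrent flush cannot interleave here. */
	pthread_rwlock_unlock(&rb.sb_lock);	/* sbunlock() */

	printf("received %zu bytes\n", n);
	return NULL;
}

static void *
flusher(void *arg)
{
	pthread_rwlock_wrlock(&rb.sb_lock);	/* sblock(): waits for the receiver */
	pthread_mutex_lock(&rb.sb_mtx);
	rb.len = 0;				/* sorflush() role: discard pending data */
	pthread_mutex_unlock(&rb.sb_mtx);
	pthread_rwlock_unlock(&rb.sb_lock);
	return NULL;
}

int
main(void)
{
	pthread_t rt, ft;

	memcpy(rb.data, "packet", 6);
	rb.len = 6;
	pthread_create(&rt, NULL, receiver, NULL);
	pthread_create(&ft, NULL, flusher, NULL);
	pthread_join(rt, NULL);
	pthread_join(ft, NULL);
	return 0;
}

The diff below implements the same pattern with rw_enter(9) on the new
`sb_lock' for SB_STANDALONE sockets.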

sorflush() and its callers were refactored so that solock() is not taken for
raw inet sockets. This is done to avoid stalling packet processing.

The new SB_STANDALONE flag is partially redundant with SB_OWNLOCK, but they
will be merged into one flag later.
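
For reference, a summary (drawn from the diff below, not part of the patch)
of the flag combinations assigned to inet sockets and the sblock()
implementation each selects:

/*
 * inet SOCK_DGRAM:  SB_MTXLOCK | SB_OWNLOCK     sb_mtx-based sblock(),
 *                                               solock() still taken in soreceive()
 * inet SOCK_RAW:    SB_MTXLOCK | SB_STANDALONE  sb_lock rwlock(9)-based sblock(),
 *                                               no solock() in soreceive()
 */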

Index: sys/kern/uipc_socket.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.325
diff -u -p -r1.325 uipc_socket.c
--- sys/kern/uipc_socket.c	31 Mar 2024 14:01:28 -0000	1.325
+++ sys/kern/uipc_socket.c	31 Mar 2024 20:30:10 -0000
@@ -142,6 +142,8 @@ soalloc(const struct protosw *prp, int w
 		return (NULL);
 	rw_init_flags(&so->so_lock, dp->dom_name, RWL_DUPOK);
 	refcnt_init(&so->so_refcnt);
+	rw_init(&so->so_rcv.sb_lock, "sbufrcv");
+	rw_init(&so->so_snd.sb_lock, "sbufsnd");
 	mtx_init(&so->so_rcv.sb_mtx, IPL_MPFLOOR);
 	mtx_init(&so->so_snd.sb_mtx, IPL_MPFLOOR);
 	klist_init_mutex(&so->so_rcv.sb_klist, &so->so_rcv.sb_mtx);
@@ -155,10 +157,10 @@ soalloc(const struct protosw *prp, int w
 	case AF_INET6:
 		switch (prp->pr_type) {
 		case SOCK_DGRAM:
-			so->so_rcv.sb_flags |= SB_OWNLOCK;
-			/* FALLTHROUGH */
+			so->so_rcv.sb_flags |= SB_MTXLOCK | SB_OWNLOCK;
+			break;
 		case SOCK_RAW:
-			so->so_rcv.sb_flags |= SB_MTXLOCK;
+			so->so_rcv.sb_flags |= SB_MTXLOCK | SB_STANDALONE;
 			break;
 		}
 		break;
@@ -345,9 +347,22 @@ sofree(struct socket *so, int keep_lock)
 	}
 #endif /* SOCKET_SPLICE */
 	sbrelease(so, &so->so_snd);
-	sorflush(so);
-	if (!keep_lock)
+
+	/*
+	 * Regardless of the '_locked' suffix, solock() must be released
+	 * before calling sorflush_locked() on an SB_STANDALONE marked
+	 * socket. We can't release solock() and then call sorflush(),
+	 * because releasing solock() is unwanted for tcp(4) sockets.
+	 */
+
+	if (so->so_rcv.sb_flags & SB_STANDALONE)
 		sounlock(so);
+
+	sorflush_locked(so);
+
+	if (!((so->so_rcv.sb_flags & SB_STANDALONE) || keep_lock))
+		sounlock(so);
+
 #ifdef SOCKET_SPLICE
 	if (so->so_sp) {
 		/* Reuse splice idle, sounsplice() has been called before. */
@@ -815,6 +830,7 @@ soreceive(struct socket *so, struct mbuf
 	const struct protosw *pr = so->so_proto;
 	struct mbuf *nextrecord;
 	size_t resid, orig_resid = uio->uio_resid;
+	int dosolock = ((so->so_rcv.sb_flags & SB_STANDALONE) == 0);
 
 	mp = mp0;
 	if (paddr)
@@ -844,12 +860,11 @@ bad:
 	if (mp)
 		*mp = NULL;
 
-	solock_shared(so);
+	if (dosolock)
+		solock_shared(so);
 restart:
-	if ((error = sblock(so, &so->so_rcv, SBLOCKWAIT(flags))) != 0) {
-		sounlock_shared(so);
-		return (error);
-	}
+	if ((error = sblock(so, &so->so_rcv, SBLOCKWAIT(flags))) != 0)
+		goto out;
 	sb_mtx_lock(&so->so_rcv);
 
 	m = so->so_rcv.sb_mb;
@@ -914,14 +929,16 @@ restart:
 		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
 		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
 
-		if (so->so_rcv.sb_flags & SB_OWNLOCK) {
+		if (so->so_rcv.sb_flags & (SB_OWNLOCK | SB_STANDALONE)) {
 			sbunlock_locked(so, &so->so_rcv);
-			sounlock_shared(so);
+			if (dosolock)
+				sounlock_shared(so);
 			error = sbwait_locked(so, &so->so_rcv);
 			sb_mtx_unlock(&so->so_rcv);
 			if (error)
 				return (error);
-			solock_shared(so);
+			if (dosolock)
+				solock_shared(so);
 		} else {
 			sb_mtx_unlock(&so->so_rcv);
 			sbunlock(so, &so->so_rcv);
@@ -998,11 +1015,13 @@ dontblock:
 			if (controlp) {
 				if (pr->pr_domain->dom_externalize) {
 					sb_mtx_unlock(&so->so_rcv);
-					sounlock_shared(so);
+					if (dosolock)
+						sounlock_shared(so);
 					error =
 					    (*pr->pr_domain->dom_externalize)
 					    (cm, controllen, flags);
-					solock_shared(so);
+					if (dosolock)
+						solock_shared(so);
 					sb_mtx_lock(&so->so_rcv);
 				}
 				*controlp = cm;
@@ -1081,9 +1100,11 @@ dontblock:
 			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
 			resid = uio->uio_resid;
 			sb_mtx_unlock(&so->so_rcv);
-			sounlock_shared(so);
+			if (dosolock)
+				sounlock_shared(so);
 			uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio);
-			solock_shared(so);
+			if (dosolock)
+				solock_shared(so);
 			sb_mtx_lock(&so->so_rcv);
 			if (uio_error)
 				uio->uio_resid = resid - len;
@@ -1166,14 +1187,21 @@ dontblock:
 				break;
 			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
 			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
-			sb_mtx_unlock(&so->so_rcv);
-			error = sbwait(so, &so->so_rcv);
-			if (error) {
-				sbunlock(so, &so->so_rcv);
-				sounlock_shared(so);
-				return (0);
+			if (dosolock) {
+				sb_mtx_unlock(&so->so_rcv);
+				error = sbwait(so, &so->so_rcv);
+				if (error) {
+					sbunlock(so, &so->so_rcv);
+					sounlock_shared(so);
+					return (0);
+				}
+				sb_mtx_lock(&so->so_rcv);
+			} else {
+				if (sbwait_locked(so, &so->so_rcv)) {
+					error = 0;
+					goto release;
+				}
 			}
-			sb_mtx_lock(&so->so_rcv);
 			if ((m = so->so_rcv.sb_mb) != NULL)
 				nextrecord = m->m_nextpkt;
 		}
@@ -1222,7 +1250,9 @@ dontblock:
 release:
 	sb_mtx_unlock(&so->so_rcv);
 	sbunlock(so, &so->so_rcv);
-	sounlock_shared(so);
+out:
+	if (dosolock)
+		sounlock_shared(so);
 	return (error);
 }
 
@@ -1231,7 +1261,6 @@ soshutdown(struct socket *so, int how)
 {
 	int error = 0;
 
-	solock(so);
 	switch (how) {
 	case SHUT_RD:
 		sorflush(so);
@@ -1240,25 +1269,29 @@ soshutdown(struct socket *so, int how)
 		sorflush(so);
 		/* FALLTHROUGH */
 	case SHUT_WR:
+		solock(so);
 		error = pru_shutdown(so);
+		sounlock(so);
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
-	sounlock(so);
 
 	return (error);
 }
 
 void
-sorflush(struct socket *so)
+sorflush_locked(struct socket *so)
 {
 	struct sockbuf *sb = &so->so_rcv;
 	struct mbuf *m;
 	const struct protosw *pr = so->so_proto;
 	int error;
 
+	if ((sb->sb_flags & SB_STANDALONE) == 0)
+		soassertlocked(so);
+
 	error = sblock(so, sb, SBL_WAIT | SBL_NOINTR);
 	/* with SBL_WAIT and SLB_NOINTR sblock() must not fail */
 	KASSERT(error == 0);
@@ -1275,6 +1308,16 @@ sorflush(struct socket *so)
 	m_purge(m);
 }
 
+void
+sorflush(struct socket *so)
+{
+	if ((so->so_rcv.sb_flags & SB_STANDALONE) == 0)
+		solock(so);
+	sorflush_locked(so);
+	if ((so->so_rcv.sb_flags & SB_STANDALONE) == 0)
+		sounlock(so);
+}
+
 #ifdef SOCKET_SPLICE
 
 #define so_splicelen	so_sp->ssp_len
@@ -1913,7 +1956,8 @@ sosetopt(struct socket *so, int level, i
 			if ((long)cnt <= 0)
 				cnt = 1;
 
-			solock(so);
+			if ((so->so_rcv.sb_flags & SB_STANDALONE) == 0)
+				solock(so);
 			mtx_enter(&sb->sb_mtx);
 
 			switch (optname) {
@@ -1939,7 +1983,8 @@ sosetopt(struct socket *so, int level, i
 			}
 
 			mtx_leave(&sb->sb_mtx);
-			sounlock(so);
+			if ((so->so_rcv.sb_flags & SB_STANDALONE) == 0)
+				sounlock(so);
 
 			break;
 		    }
Index: sys/kern/uipc_socket2.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket2.c,v
retrieving revision 1.147
diff -u -p -r1.147 uipc_socket2.c
--- sys/kern/uipc_socket2.c	31 Mar 2024 13:50:00 -0000	1.147
+++ sys/kern/uipc_socket2.c	31 Mar 2024 20:30:10 -0000
@@ -350,7 +350,9 @@ socantsendmore(struct socket *so)
 void
 socantrcvmore(struct socket *so)
 {
-	soassertlocked(so);
+	if ((so->so_rcv.sb_flags & SB_STANDALONE) == 0)
+		soassertlocked(so);
+
 	mtx_enter(&so->so_rcv.sb_mtx);
 	so->so_rcv.sb_state |= SS_CANTRCVMORE;
 	mtx_leave(&so->so_rcv.sb_mtx);
@@ -557,6 +559,17 @@ sblock(struct socket *so, struct sockbuf
 {
 	int error = 0, prio = PSOCK;
 
+	if (sb->sb_flags & SB_STANDALONE) {
+		int rwflags = RW_WRITE;
+
+		if (!(flags & SBL_NOINTR || sb->sb_flags & SB_NOINTR))
+			rwflags |= RW_INTR;
+		if (!(flags & SBL_WAIT))
+			rwflags |= RW_NOSLEEP;
+
+		return rw_enter(&sb->sb_lock, rwflags);
+	}
+
 	soassertlocked(so);
 
 	mtx_enter(&sb->sb_mtx);
@@ -589,6 +602,11 @@ out:
 void
 sbunlock_locked(struct socket *so, struct sockbuf *sb)
 {
+	if (sb->sb_flags & SB_STANDALONE) {
+		rw_exit(&sb->sb_lock);
+		return;
+	}
+
 	MUTEX_ASSERT_LOCKED(&sb->sb_mtx);
 
 	sb->sb_flags &= ~SB_LOCK;
@@ -601,6 +619,11 @@ sbunlock_locked(struct socket *so, struc
 void
 sbunlock(struct socket *so, struct sockbuf *sb)
 {
+	if (sb->sb_flags & SB_STANDALONE) {
+		rw_exit(&sb->sb_lock);
+		return;
+	}
+
 	mtx_enter(&sb->sb_mtx);
 	sbunlock_locked(so, sb);
 	mtx_leave(&sb->sb_mtx);
Index: sys/kern/uipc_usrreq.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v
retrieving revision 1.203
diff -u -p -r1.203 uipc_usrreq.c
--- sys/kern/uipc_usrreq.c	26 Mar 2024 09:46:47 -0000	1.203
+++ sys/kern/uipc_usrreq.c	31 Mar 2024 20:30:10 -0000
@@ -1450,9 +1450,7 @@ unp_gc(void *arg __unused)
 				 * thread.
 				 */
 				so = unp->unp_socket;
-				solock(so);
 				sorflush(so);
-				sounlock(so);
 			}
 		}
 	}
Index: sys/sys/socketvar.h
===================================================================
RCS file: /cvs/src/sys/sys/socketvar.h,v
retrieving revision 1.127
diff -u -p -r1.127 socketvar.h
--- sys/sys/socketvar.h	27 Mar 2024 22:47:53 -0000	1.127
+++ sys/sys/socketvar.h	31 Mar 2024 20:30:10 -0000
@@ -106,7 +106,8 @@ struct socket {
  * Variables for socket buffering.
  */
 	struct	sockbuf {
-		struct mutex sb_mtx;
+		struct rwlock sb_lock;
+		struct mutex  sb_mtx;
 /* The following fields are all zeroed on flush. */
 #define	sb_startzero	sb_cc
 		u_long	sb_cc;		/* actual chars in buffer */
@@ -136,6 +137,7 @@ struct socket {
 #define SB_NOINTR	0x0040		/* operations not interruptible */
 #define SB_MTXLOCK	0x0080		/* use sb_mtx for sockbuf protection */
 #define SB_OWNLOCK	0x0100		/* sb_mtx used standalone */
+#define SB_STANDALONE	0x0200		/* standalone sblock() */
 
 	void	(*so_upcall)(struct socket *so, caddr_t arg, int waitf);
 	caddr_t	so_upcallarg;		/* Arg for above */
@@ -401,6 +403,7 @@ int	sosend(struct socket *, struct mbuf 
 int	sosetopt(struct socket *, int, int, struct mbuf *);
 int	soshutdown(struct socket *, int);
 void	sorflush(struct socket *);
+void	sorflush_locked(struct socket *);
 void	sowakeup(struct socket *, struct sockbuf *);
 void	sorwakeup(struct socket *);
 void	sowwakeup(struct socket *);