Index | Thread | Search

From:
Vitaliy Makkoveev <mvs@openbsd.org>
Subject:
Use `sb_mtx' instead of `inp_mtx' in receive path for inet sockets
To:
Alexander Bluhm <bluhm@openbsd.org>, tech@openbsd.org
Date:
Sun, 4 Feb 2024 20:40:13 +0300

Download raw body.

Thread
In soreceve(), we only touch `so_rcv' socket buffer, which has it's own
`sb_mtx' mutex(9) for protection. So, we can avoid solock() in this
path - it's enough to hold `sb_mtx' in soreceive() and around
corresponding sbappend*(). But not right now :)

This time we use shared netlock for some inet sockets in the soreceive()
path. To protect `so_rcv' buffer we use `inp_mtx' mutex(9) and the
prul_lock() to acquire this mutex(9) at socket layer. But the `inp_mtx'
mutex belongs to the PCB. We initialize socket before PCB, tcp(4)
sockets could exist without PCB, so I want to use different locks for
PCB and socket layers of socket.

This diff mechanically replaces `inp_mtx' by `sb_mtx' in the receive
path. Only for sockets which already use `inp_mtx'. All other sockets
left as is. They will be converted later.

Since the `sb_mtx' is optional, the new SB_MTXLOCK flag introduced. If
this flag is set on `sb_flags', the `sb_mtx' mutex(9) should be taken.
New sb_mtx_lock() and sb_mtx_unlock() was introduced to hide this check.
They are temporary and will be replaced by mtx_enter() when all this
area will be converted to `sb_mtx' mutex(9).

Also, the new sbmtxassertlocked() function introduced to throw
corresponding assertion for SB_MTXLOCK marked buffers. This time only
sbappendaddr() calls it. This function is also temporary and will be
replaced by MTX_ASSERT_LOCKED() later.

This naming could be strange, bu now we sblock() for different purpose,
so I want to keep it as is for a while.


Index: sys/kern/uipc_socket.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.316
diff -u -p -r1.316 uipc_socket.c
--- sys/kern/uipc_socket.c	3 Feb 2024 22:50:08 -0000	1.316
+++ sys/kern/uipc_socket.c	4 Feb 2024 17:08:47 -0000
@@ -150,6 +150,18 @@ soalloc(const struct domain *dp, int wai
 	TAILQ_INIT(&so->so_q0);
 	TAILQ_INIT(&so->so_q);
 
+	switch (dp->dom_family) {
+	case AF_INET:
+	case AF_INET6:
+		switch (dp->dom_protosw->pr_type) {
+		case SOCK_DGRAM:
+		case SOCK_RAW:
+			so->so_rcv.sb_flags |= SB_MTXLOCK;
+			break;
+		}
+		break;
+	}
+
 	return (so);
 }
 
@@ -825,7 +837,7 @@ restart:
 		sounlock_shared(so);
 		return (error);
 	}
-	pru_lock(so);
+	sb_mtx_lock(&so->so_rcv);
 
 	m = so->so_rcv.sb_mb;
 #ifdef SOCKET_SPLICE
@@ -889,7 +901,7 @@ restart:
 		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
 		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
 		sbunlock(so, &so->so_rcv);
-		pru_unlock(so);
+		sb_mtx_unlock(&so->so_rcv);
 		error = sbwait(so, &so->so_rcv);
 		if (error) {
 			sounlock_shared(so);
@@ -961,13 +973,13 @@ dontblock:
 			sbsync(&so->so_rcv, nextrecord);
 			if (controlp) {
 				if (pr->pr_domain->dom_externalize) {
-					pru_unlock(so);
+					sb_mtx_unlock(&so->so_rcv);
 					sounlock_shared(so);
 					error =
 					    (*pr->pr_domain->dom_externalize)
 					    (cm, controllen, flags);
 					solock_shared(so);
-					pru_lock(so);
+					sb_mtx_lock(&so->so_rcv);
 				}
 				*controlp = cm;
 			} else {
@@ -1041,11 +1053,11 @@ dontblock:
 			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
 			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
 			resid = uio->uio_resid;
-			pru_unlock(so);
+			sb_mtx_unlock(&so->so_rcv);
 			sounlock_shared(so);
 			uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio);
 			solock_shared(so);
-			pru_lock(so);
+			sb_mtx_lock(&so->so_rcv);
 			if (uio_error)
 				uio->uio_resid = resid - len;
 		} else
@@ -1127,14 +1139,14 @@ dontblock:
 				break;
 			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
 			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
-			pru_unlock(so);
+			sb_mtx_unlock(&so->so_rcv);
 			error = sbwait(so, &so->so_rcv);
 			if (error) {
 				sbunlock(so, &so->so_rcv);
 				sounlock_shared(so);
 				return (0);
 			}
-			pru_lock(so);
+			sb_mtx_lock(&so->so_rcv);
 			if ((m = so->so_rcv.sb_mb) != NULL)
 				nextrecord = m->m_nextpkt;
 		}
@@ -1168,7 +1180,7 @@ dontblock:
 	    (flags & MSG_EOR) == 0 &&
 	    (so->so_rcv.sb_state & SS_CANTRCVMORE) == 0) {
 		sbunlock(so, &so->so_rcv);
-		pru_unlock(so);
+		sb_mtx_unlock(&so->so_rcv);
 		goto restart;
 	}
 
@@ -1179,7 +1191,7 @@ dontblock:
 		*flagsp |= flags;
 release:
 	sbunlock(so, &so->so_rcv);
-	pru_unlock(so);
+	sb_mtx_unlock(&so->so_rcv);
 	sounlock_shared(so);
 	return (error);
 }
Index: sys/kern/uipc_socket2.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket2.c,v
retrieving revision 1.141
diff -u -p -r1.141 uipc_socket2.c
--- sys/kern/uipc_socket2.c	3 Feb 2024 22:50:08 -0000	1.141
+++ sys/kern/uipc_socket2.c	4 Feb 2024 17:08:47 -0000
@@ -500,6 +500,16 @@ sosleep_nsec(struct socket *so, void *id
 	return ret;
 }
 
+void
+sbmtxassertlocked(struct socket *so, struct sockbuf *sb)
+{
+	if (sb->sb_flags & SB_MTXLOCK) {
+		if (splassert_ctl > 0 && mtx_owned(&sb->sb_mtx) == 0)
+			splassert_fail(0, RW_WRITE, __func__);
+	} else
+		soassertlocked(so);
+}
+
 /*
  * Wait for data to arrive at/drain from a socket buffer.
  */
@@ -919,7 +929,7 @@ sbappendaddr(struct socket *so, struct s
 	struct mbuf *m, *n, *nlast;
 	int space = asa->sa_len;
 
-	soassertlocked(so);
+	sbmtxassertlocked(so, sb);
 
 	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
 		panic("sbappendaddr");
Index: sys/netinet/ip_divert.c
===================================================================
RCS file: /cvs/src/sys/netinet/ip_divert.c,v
retrieving revision 1.93
diff -u -p -r1.93 ip_divert.c
--- sys/netinet/ip_divert.c	3 Feb 2024 22:50:09 -0000	1.93
+++ sys/netinet/ip_divert.c	4 Feb 2024 17:08:47 -0000
@@ -241,14 +241,14 @@ divert_packet(struct mbuf *m, int dir, u
 		in_proto_cksum_out(m, NULL);
 	}
 
-	mtx_enter(&inp->inp_mtx);
 	so = inp->inp_socket;
+	mtx_enter(&so->so_rcv.sb_mtx);
 	if (sbappendaddr(so, &so->so_rcv, sintosa(&sin), m, NULL) == 0) {
-		mtx_leave(&inp->inp_mtx);
+		mtx_leave(&so->so_rcv.sb_mtx);
 		divstat_inc(divs_fullsock);
 		goto bad;
 	}
-	mtx_leave(&inp->inp_mtx);
+	mtx_leave(&so->so_rcv.sb_mtx);
 	sorwakeup(so);
 
 	in_pcbunref(inp);
Index: sys/netinet/ip_mroute.c
===================================================================
RCS file: /cvs/src/sys/netinet/ip_mroute.c,v
retrieving revision 1.140
diff -u -p -r1.140 ip_mroute.c
--- sys/netinet/ip_mroute.c	6 Dec 2023 09:27:17 -0000	1.140
+++ sys/netinet/ip_mroute.c	4 Feb 2024 17:08:47 -0000
@@ -1051,12 +1051,11 @@ int
 socket_send(struct socket *so, struct mbuf *mm, struct sockaddr_in *src)
 {
 	if (so != NULL) {
-		struct inpcb *inp = sotoinpcb(so);
 		int ret;
 
-		mtx_enter(&inp->inp_mtx);
+		mtx_enter(&so->so_rcv.sb_mtx);
 		ret = sbappendaddr(so, &so->so_rcv, sintosa(src), mm, NULL);
-		mtx_leave(&inp->inp_mtx);
+		mtx_leave(&so->so_rcv.sb_mtx);
 
 		if (ret != 0) {
 			sorwakeup(so);
Index: sys/netinet/raw_ip.c
===================================================================
RCS file: /cvs/src/sys/netinet/raw_ip.c,v
retrieving revision 1.155
diff -u -p -r1.155 raw_ip.c
--- sys/netinet/raw_ip.c	3 Feb 2024 22:50:09 -0000	1.155
+++ sys/netinet/raw_ip.c	4 Feb 2024 17:08:47 -0000
@@ -220,24 +220,24 @@ rip_input(struct mbuf **mp, int *offp, i
 		else
 			n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
 		if (n != NULL) {
+			struct socket *so = inp->inp_socket;
 			int ret;
 
 			if (inp->inp_flags & INP_CONTROLOPTS ||
 			    inp->inp_socket->so_options & SO_TIMESTAMP)
 				ip_savecontrol(inp, &opts, ip, n);
 
-			mtx_enter(&inp->inp_mtx);
-			ret = sbappendaddr(inp->inp_socket,
-			    &inp->inp_socket->so_rcv,
+			mtx_enter(&so->so_rcv.sb_mtx);
+			ret = sbappendaddr(so, &so->so_rcv,
 			    sintosa(&ripsrc), n, opts);
-			mtx_leave(&inp->inp_mtx);
+			mtx_leave(&so->so_rcv.sb_mtx);
 
 			if (ret == 0) {
 				/* should notify about lost packet */
 				m_freem(n);
 				m_freem(opts);
 			} else
-				sorwakeup(inp->inp_socket);
+				sorwakeup(so);
 		}
 		in_pcbunref(inp);
 	}
Index: sys/netinet/udp_usrreq.c
===================================================================
RCS file: /cvs/src/sys/netinet/udp_usrreq.c,v
retrieving revision 1.317
diff -u -p -r1.317 udp_usrreq.c
--- sys/netinet/udp_usrreq.c	3 Feb 2024 22:50:09 -0000	1.317
+++ sys/netinet/udp_usrreq.c	4 Feb 2024 17:08:47 -0000
@@ -697,15 +697,15 @@ udp_sbappend(struct inpcb *inp, struct m
 #endif
 	m_adj(m, hlen);
 
-	mtx_enter(&inp->inp_mtx);
+	mtx_enter(&so->so_rcv.sb_mtx);
 	if (sbappendaddr(so, &so->so_rcv, srcaddr, m, opts) == 0) {
-		mtx_leave(&inp->inp_mtx);
+		mtx_leave(&so->so_rcv.sb_mtx);
 		udpstat_inc(udps_fullsock);
 		m_freem(m);
 		m_freem(opts);
 		return;
 	}
-	mtx_leave(&inp->inp_mtx);
+	mtx_leave(&so->so_rcv.sb_mtx);
 
 	sorwakeup(so);
 }
Index: sys/netinet6/ip6_divert.c
===================================================================
RCS file: /cvs/src/sys/netinet6/ip6_divert.c,v
retrieving revision 1.92
diff -u -p -r1.92 ip6_divert.c
--- sys/netinet6/ip6_divert.c	3 Feb 2024 22:50:09 -0000	1.92
+++ sys/netinet6/ip6_divert.c	4 Feb 2024 17:08:47 -0000
@@ -248,14 +248,14 @@ divert6_packet(struct mbuf *m, int dir, 
 		in6_proto_cksum_out(m, NULL);
 	}
 
-	mtx_enter(&inp->inp_mtx);
 	so = inp->inp_socket;
+	mtx_enter(&so->so_rcv.sb_mtx);
 	if (sbappendaddr(so, &so->so_rcv, sin6tosa(&sin6), m, NULL) == 0) {
-		mtx_leave(&inp->inp_mtx);
+		mtx_leave(&so->so_rcv.sb_mtx);
 		div6stat_inc(div6s_fullsock);
 		goto bad;
 	}
-	mtx_leave(&inp->inp_mtx);
+	mtx_leave(&so->so_rcv.sb_mtx);
 	sorwakeup(so);
 
 	in_pcbunref(inp);
Index: sys/netinet6/ip6_mroute.c
===================================================================
RCS file: /cvs/src/sys/netinet6/ip6_mroute.c,v
retrieving revision 1.139
diff -u -p -r1.139 ip6_mroute.c
--- sys/netinet6/ip6_mroute.c	3 Feb 2024 22:50:09 -0000	1.139
+++ sys/netinet6/ip6_mroute.c	4 Feb 2024 17:08:47 -0000
@@ -856,17 +856,16 @@ int
 socket6_send(struct socket *so, struct mbuf *mm, struct sockaddr_in6 *src)
 {
 	if (so != NULL) {
-		struct inpcb *inp = sotoinpcb(so);
 		int ret;
 
-		mtx_enter(&inp->inp_mtx);
+		mtx_enter(&so->so_rcv.sb_mtx);
 		ret = sbappendaddr(so, &so->so_rcv, sin6tosa(src), mm, NULL);
-		if (ret != 0)
-			sorwakeup(so);
-		mtx_leave(&inp->inp_mtx);
+		mtx_leave(&so->so_rcv.sb_mtx);
 
-		if (ret != 0)
+		if (ret != 0) {
+			sorwakeup(so);
 			return 0;
+		}
 	}
 	m_freem(mm);
 	return -1;
Index: sys/netinet6/raw_ip6.c
===================================================================
RCS file: /cvs/src/sys/netinet6/raw_ip6.c,v
retrieving revision 1.180
diff -u -p -r1.180 raw_ip6.c
--- sys/netinet6/raw_ip6.c	3 Feb 2024 22:50:09 -0000	1.180
+++ sys/netinet6/raw_ip6.c	4 Feb 2024 17:08:47 -0000
@@ -263,6 +263,7 @@ rip6_input(struct mbuf **mp, int *offp, 
 		else
 			n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
 		if (n != NULL) {
+			struct socket *so = inp->inp_socket;
 			int ret;
 
 			if (inp->inp_flags & IN6P_CONTROLOPTS)
@@ -270,11 +271,10 @@ rip6_input(struct mbuf **mp, int *offp, 
 			/* strip intermediate headers */
 			m_adj(n, *offp);
 
-			mtx_enter(&inp->inp_mtx);
-			ret = sbappendaddr(inp->inp_socket,
-			    &inp->inp_socket->so_rcv,
+			mtx_enter(&so->so_rcv.sb_mtx);
+			ret = sbappendaddr(so, &so->so_rcv,
 			    sin6tosa(&rip6src), n, opts);
-			mtx_leave(&inp->inp_mtx);
+			mtx_leave(&so->so_rcv.sb_mtx);
 
 			if (ret == 0) {
 				/* should notify about lost packet */
@@ -282,7 +282,7 @@ rip6_input(struct mbuf **mp, int *offp, 
 				m_freem(opts);
 				rip6stat_inc(rip6s_fullsock);
 			} else
-				sorwakeup(inp->inp_socket);
+				sorwakeup(so);
 		}
 		in_pcbunref(inp);
 	}
Index: sys/sys/socketvar.h
===================================================================
RCS file: /cvs/src/sys/sys/socketvar.h,v
retrieving revision 1.122
diff -u -p -r1.122 socketvar.h
--- sys/sys/socketvar.h	3 Feb 2024 22:50:09 -0000	1.122
+++ sys/sys/socketvar.h	4 Feb 2024 17:08:47 -0000
@@ -134,6 +134,7 @@ struct socket {
 #define	SB_ASYNC	0x10		/* ASYNC I/O, need signals */
 #define	SB_SPLICE	0x20		/* buffer is splice source or drain */
 #define	SB_NOINTR	0x40		/* operations not interruptible */
+#define SB_MTXLOCK	0x80		/* use sb_mtx for sockbuf protection */
 
 	void	(*so_upcall)(struct socket *so, caddr_t arg, int waitf);
 	caddr_t	so_upcallarg;		/* Arg for above */
@@ -196,6 +197,22 @@ sorele(struct socket *so)
 
 #define isspliced(so)		((so)->so_sp && (so)->so_sp->ssp_socket)
 #define issplicedback(so)	((so)->so_sp && (so)->so_sp->ssp_soback)
+
+static inline void
+sb_mtx_lock(struct sockbuf *sb)
+{
+	if (sb->sb_flags & SB_MTXLOCK)
+		mtx_enter(&sb->sb_mtx);
+}
+
+static inline void
+sb_mtx_unlock(struct sockbuf *sb)
+{
+	if (sb->sb_flags & SB_MTXLOCK)
+		mtx_leave(&sb->sb_mtx);
+}
+
+void	sbmtxassertlocked(struct socket *so, struct sockbuf *);
 
 /*
  * Do we need to notify the other side when I/O is possible?