Index | Thread | Search

From:
Vitaliy Makkoveev <mvs@openbsd.org>
Subject:
Re: Relax sockets splicing locking
To:
Alexander Bluhm <bluhm@openbsd.org>, tech@openbsd.org
Date:
Sat, 4 Jan 2025 14:06:14 +0300

Download raw body.

Thread
On Fri, Jan 03, 2025 at 05:22:51PM +0300, Vitaliy Makkoveev wrote:
> Sockets splicing works around sockets buffers which have their own locks
> for all socket types, especially sblock() on `so_snd' which keeps
> sockets being spliced. 
> 
>  - sosplice() does read-only sockets options and state checks, the only
>    modification is `so_sp' assignment. The SB_SPLICE bit modification,
>    `ssp_socket' and `ssp_soback' assignment protected with `sb_mtx'
>    mutex(9). PCB layer does corresponding checks with `sb_mtx' held, so
>    shared solock() is pretty enough in sosplice() path. Introduce
>    special sosplice_solock_pair() for that purpose.
> 
>  - sounsplice() requires shared socket lock only around so{r,w}wakeup
>    calls.
> 
>  - Push exclusive solock() down to tcp(4) case of somove(). Such sockets
>    are not ready do unlocked somove() yet.
> 

A little bit updated. The "Make WITNESS happy..." comment is not MP
specific, so rewrite it as XXX:

+	/*
+	 * XXX: Make WITNESS happy. AF_INET and AF_INET6 sockets could be
+	 * spliced together.
+	 */

Also whitespace fixed.

Index: sys/kern/uipc_socket.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket.c,v
diff -u -p -r1.355 uipc_socket.c
--- sys/kern/uipc_socket.c	3 Jan 2025 12:56:14 -0000	1.355
+++ sys/kern/uipc_socket.c	4 Jan 2025 11:02:12 -0000
@@ -133,14 +133,29 @@ struct socket *
 soalloc(const struct protosw *prp, int wait)
 {
 	const struct domain *dp = prp->pr_domain;
+	const char *dom_name = dp->dom_name;
 	struct socket *so;
 
 	so = pool_get(&socket_pool, (wait == M_WAIT ? PR_WAITOK : PR_NOWAIT) |
 	    PR_ZERO);
 	if (so == NULL)
 		return (NULL);
-	rw_init_flags(&so->so_lock, dp->dom_name, RWL_DUPOK);
+
+#ifdef WITNESS
+	/*
+	 * XXX: Make WITNESS happy. AF_INET and AF_INET6 sockets could be
+	 * spliced together.
+	 */
+	switch (dp->dom_family) {
+	case AF_INET:
+	case AF_INET6:
+		dom_name = "inet46";
+		break;
+	}
+#endif
+
 	refcnt_init(&so->so_refcnt);
+	rw_init_flags(&so->so_lock, dom_name, RWL_DUPOK);
 	rw_init(&so->so_rcv.sb_lock, "sbufrcv");
 	rw_init(&so->so_snd.sb_lock, "sbufsnd");
 	mtx_init_flags(&so->so_rcv.sb_mtx, IPL_MPFLOOR, "sbrcv", 0);
@@ -435,9 +450,7 @@ discard:
 
 			if (so->so_sp->ssp_soback == so)
 				freeing |= SOSP_FREEING_READ;
-			solock(soback);
 			sounsplice(so->so_sp->ssp_soback, so, freeing);
-			sounlock(soback);
 		}
 		sbunlock(&soback->so_rcv);
 		sorele(soback);
@@ -445,13 +458,14 @@ discard:
 notsplicedback:
 		sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
 		if (isspliced(so)) {
+			struct socket *sosp;
 			int freeing = SOSP_FREEING_READ;
 
 			if (so == so->so_sp->ssp_socket)
 				freeing |= SOSP_FREEING_WRITE;
-			solock(so);
+			sosp = soref(so->so_sp->ssp_socket);
 			sounsplice(so, so->so_sp->ssp_socket, freeing);
-			sounlock(so);
+			sorele(sosp);
 		}
 		sbunlock(&so->so_rcv);
 
@@ -1319,6 +1333,38 @@ sorflush(struct socket *so)
 #define so_idleto	so_sp->ssp_idleto
 #define so_splicetask	so_sp->ssp_task
 
+void
+sosplice_solock_pair(struct socket *so1, struct socket *so2)
+{
+	NET_LOCK_SHARED();
+
+	if (so1 == so2)
+		rw_enter_write(&so1->so_lock);
+	else if (so1 < so2) {
+		rw_enter_write(&so1->so_lock);
+		rw_enter_write(&so2->so_lock);
+	} else {
+		rw_enter_write(&so2->so_lock);
+		rw_enter_write(&so1->so_lock);
+	}
+}
+
+void
+sosplice_sounlock_pair(struct socket *so1, struct socket *so2)
+{
+	if (so1 == so2)
+		rw_exit_write(&so1->so_lock);
+	else if (so1 < so2) {
+		rw_exit_write(&so2->so_lock);
+		rw_exit_write(&so1->so_lock);
+	} else {
+		rw_exit_write(&so1->so_lock);
+		rw_exit_write(&so2->so_lock);
+	}
+
+	NET_UNLOCK_SHARED();
+}
+
 int
 sosplice(struct socket *so, int fd, off_t max, struct timeval *tv)
 {
@@ -1338,10 +1384,11 @@ sosplice(struct socket *so, int fd, off_
 	if (fd < 0) {
 		if ((error = sblock(&so->so_rcv, SBL_WAIT)) != 0)
 			return (error);
-		solock(so);
-		if (so->so_sp && so->so_sp->ssp_socket)
+		if (so->so_sp && so->so_sp->ssp_socket) {
+			sosp = soref(so->so_sp->ssp_socket);
 			sounsplice(so, so->so_sp->ssp_socket, 0);
-		sounlock(so);
+			sorele(sosp);
+		}
 		sbunlock(&so->so_rcv);
 		return (0);
 	}
@@ -1382,7 +1429,7 @@ sosplice(struct socket *so, int fd, off_
 		sbunlock(&so->so_rcv);
 		goto frele;
 	}
-	solock(so);
+	sosplice_solock_pair(so, sosp);
 
 	if ((so->so_options & SO_ACCEPTCONN) ||
 	    (sosp->so_options & SO_ACCEPTCONN)) {
@@ -1430,8 +1477,9 @@ sosplice(struct socket *so, int fd, off_
 	mtx_leave(&sosp->so_snd.sb_mtx);
 	mtx_leave(&so->so_rcv.sb_mtx);
 
-	if ((so->so_proto->pr_flags & PR_WANTRCVD) == 0)
-		sounlock(so);
+	sosplice_sounlock_pair(so, sosp);
+	sbunlock(&sosp->so_snd);
+
 	if (somove(so, M_WAIT)) {
 		mtx_enter(&so->so_rcv.sb_mtx);
 		mtx_enter(&sosp->so_snd.sb_mtx);
@@ -1440,16 +1488,17 @@ sosplice(struct socket *so, int fd, off_
 		mtx_leave(&sosp->so_snd.sb_mtx);
 		mtx_leave(&so->so_rcv.sb_mtx);
 	}
-	if ((so->so_proto->pr_flags & PR_WANTRCVD) == 0)
-		solock(so);
+
+	sbunlock(&so->so_rcv);
+	FRELE(fp, curproc);
+	return (0);
 
  release:
-	sounlock(so);
+	sosplice_sounlock_pair(so, sosp);
 	sbunlock(&sosp->so_snd);
 	sbunlock(&so->so_rcv);
  frele:
 	FRELE(fp, curproc);
-
 	return (error);
 }
 
@@ -1457,7 +1506,6 @@ void
 sounsplice(struct socket *so, struct socket *sosp, int freeing)
 {
 	sbassertlocked(&so->so_rcv);
-	soassertlocked(so);
 
 	task_del(sosplice_taskq, &so->so_splicetask);
 	timeout_del(&so->so_idleto);
@@ -1471,10 +1519,23 @@ sounsplice(struct socket *so, struct soc
 	mtx_leave(&so->so_rcv.sb_mtx);
 
 	/* Do not wakeup a socket that is about to be freed. */
-	if ((freeing & SOSP_FREEING_READ) == 0 && soreadable(so))
-		sorwakeup(so);
-	if ((freeing & SOSP_FREEING_WRITE) == 0 && sowriteable(sosp))
-		sowwakeup(sosp);
+	if ((freeing & SOSP_FREEING_READ) == 0) {
+		int readable;
+
+		solock_shared(so);
+		mtx_enter(&so->so_rcv.sb_mtx);
+		readable = soreadable(so);
+		mtx_leave(&so->so_rcv.sb_mtx);
+		if (readable)
+			sorwakeup(so);
+		sounlock_shared(so);
+	}
+	if ((freeing & SOSP_FREEING_WRITE) == 0) {
+		solock_shared(sosp);
+		if (sowriteable(sosp))
+			sowwakeup(sosp);
+		sounlock_shared(sosp);
+	}
 }
 
 void
@@ -1483,17 +1544,14 @@ soidle(void *arg)
 	struct socket *so = arg;
 
 	sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
-	solock(so);
-	/*
-	 * Depending on socket type, sblock(&so->so_rcv) or solock()
-	 * is always held while modifying SB_SPLICE and
-	 * so->so_sp->ssp_socket.
-	 */
 	if (so->so_rcv.sb_flags & SB_SPLICE) {
-		so->so_error = ETIMEDOUT;
+		struct socket *sosp;
+
+		WRITE_ONCE(so->so_error, ETIMEDOUT);
+		sosp = soref(so->so_sp->ssp_socket);
 		sounsplice(so, so->so_sp->ssp_socket, 0);
+		sorele(sosp);
 	}
-	sounlock(so);
 	sbunlock(&so->so_rcv);
 }
 
@@ -1502,31 +1560,13 @@ sotask(void *arg)
 {
 	struct socket *so = arg;
 	int doyield = 0;
-	int sockstream = (so->so_proto->pr_flags & PR_WANTRCVD);
-
-	/*
-	 * sblock() on `so_rcv' protects sockets from being unspliced
-	 * for UDP case. TCP sockets still rely on solock().
-	 */
 
 	sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
 	if (so->so_rcv.sb_flags & SB_SPLICE) {
-		struct socket *sosp = so->so_sp->ssp_socket;
-
-		if (sockstream) {
-			sblock(&sosp->so_snd, SBL_WAIT | SBL_NOINTR);
-			solock(so);
+		if (so->so_proto->pr_flags & PR_WANTRCVD)
 			doyield = 1;
-		}
-
 		somove(so, M_DONTWAIT);
-
-		if (sockstream) {
-			sounlock(so);
-			sbunlock(&sosp->so_snd);
-		}
 	}
-
 	sbunlock(&so->so_rcv);
 
 	if (doyield) {
@@ -1553,11 +1593,11 @@ somove(struct socket *so, int wait)
 	int		 sockdgram = ((so->so_proto->pr_flags &
 			     PR_WANTRCVD) == 0);
 
-	if (sockdgram)
-		sbassertlocked(&so->so_rcv);
-	else {
-		sbassertlocked(&sosp->so_snd);
-		soassertlocked(so);
+	sbassertlocked(&so->so_rcv);
+
+	if (!sockdgram) {
+		sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR);
+		solock(so);
 	}
 
 	mtx_enter(&so->so_rcv.sb_mtx);
@@ -1862,12 +1902,15 @@ somove(struct socket *so, int wait)
 	mtx_leave(&sosp->so_snd.sb_mtx);
 	mtx_leave(&so->so_rcv.sb_mtx);
 
+	if (!sockdgram) {
+		sbunlock(&so->so_snd);
+		sounlock(so);
+	}
+
 	if (unsplice) {
-		if (sockdgram)
-			solock(so);
+		soref(sosp);
 		sounsplice(so, sosp, 0);
-		if (sockdgram)
-			sounlock(so);
+		sorele(sosp);
 
 		return (0);
 	}
Index: sys/netinet/tcp_usrreq.c
===================================================================
RCS file: /cvs/src/sys/netinet/tcp_usrreq.c,v
diff -u -p -r1.236 tcp_usrreq.c
--- sys/netinet/tcp_usrreq.c	1 Jan 2025 13:44:22 -0000	1.236
+++ sys/netinet/tcp_usrreq.c	4 Jan 2025 11:02:12 -0000
@@ -198,8 +198,10 @@ tcp_sogetpcb(struct socket *so, struct i
 	 * structure will point at a subsidiary (struct tcpcb).
 	 */
 	if ((inp = sotoinpcb(so)) == NULL || (tp = intotcpcb(inp)) == NULL) {
-		if (so->so_error)
-			return so->so_error;
+		int error;
+
+		if ((error = READ_ONCE(so->so_error)))
+			return error;
 		return EINVAL;
 	}