Index | Thread | Search

From:
Vitaliy Makkoveev <mvs@openbsd.org>
Subject:
Don't take solock() in sosend() for SOCK_RAW sockets
To:
tech@openbsd.org
Date:
Fri, 19 Apr 2024 22:39:58 +0300

Download raw body.

Thread
  • Vitaliy Makkoveev:

    Don't take solock() in sosend() for SOCK_RAW sockets

Mostly inspired by udp(4) sockets. Parallel udp(4) input blocked by
somove() which requires to protect not only `so_rcv', but also `so_snd'
of spliced peer. However, for SOCK_DGRAM sockets, `sb_mtx' looks pretty
enough to serialize somove() and sosend().

Raw sockets are the simplest inet sockets, so I want to use them to
start landing `sb_mtx' to protect `so_snd' buffer. Now solock() is taken
only around pru_send*(), the rest of sosend() serialized by sblock() and
`sb_mtx' mutex(9). The unlocked SS_ISCONNECTED checks looks fine,
because rip{,6}_send() check it. Also, the SS_ISCONNECTED could be lost
due to solock() release around following m_getuio().

The profit of unlocked sosend() could be not obvious, but let's assume
it as pushing netlock deeper to make some checks and sleep arond
`so_snd' lockless. Also, for unix(4) lockless sosend() removes
re-locking from uipc_rcvd() in soreceive() path.

I have the same diff for udp(4) sockets, but I want to do somove()
refactoring separately.

Index: sys/kern/sys_socket.c
===================================================================
RCS file: /cvs/src/sys/kern/sys_socket.c,v
retrieving revision 1.64
diff -u -p -r1.64 sys_socket.c
--- sys/kern/sys_socket.c	11 Apr 2024 08:33:37 -0000	1.64
+++ sys/kern/sys_socket.c	19 Apr 2024 19:19:44 -0000
@@ -92,6 +92,7 @@ soo_ioctl(struct file *fp, u_long cmd, c
 	case FIOASYNC:
 		solock(so);
 		mtx_enter(&so->so_rcv.sb_mtx);
+		mtx_enter(&so->so_snd.sb_mtx);
 		if (*(int *)data) {
 			so->so_rcv.sb_flags |= SB_ASYNC;
 			so->so_snd.sb_flags |= SB_ASYNC;
@@ -99,6 +100,7 @@ soo_ioctl(struct file *fp, u_long cmd, c
 			so->so_rcv.sb_flags &= ~SB_ASYNC;
 			so->so_snd.sb_flags &= ~SB_ASYNC;
 		}
+		mtx_leave(&so->so_snd.sb_mtx);
 		mtx_leave(&so->so_rcv.sb_mtx);
 		sounlock(so);
 		break;
Index: sys/kern/uipc_socket.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.330
diff -u -p -r1.330 uipc_socket.c
--- sys/kern/uipc_socket.c	15 Apr 2024 21:31:29 -0000	1.330
+++ sys/kern/uipc_socket.c	19 Apr 2024 19:19:44 -0000
@@ -146,8 +146,8 @@ soalloc(const struct protosw *prp, int w
 	refcnt_init(&so->so_refcnt);
 	rw_init(&so->so_rcv.sb_lock, "sbufrcv");
 	rw_init(&so->so_snd.sb_lock, "sbufsnd");
-	mtx_init(&so->so_rcv.sb_mtx, IPL_MPFLOOR);
-	mtx_init(&so->so_snd.sb_mtx, IPL_MPFLOOR);
+	mtx_init_flags(&so->so_rcv.sb_mtx, IPL_MPFLOOR, "sbrcv", 0);
+	mtx_init_flags(&so->so_snd.sb_mtx, IPL_MPFLOOR, "sbsnd", 0);
 	klist_init_mutex(&so->so_rcv.sb_klist, &so->so_rcv.sb_mtx);
 	klist_init_mutex(&so->so_snd.sb_klist, &so->so_snd.sb_mtx);
 	sigio_init(&so->so_sigio);
@@ -158,8 +158,10 @@ soalloc(const struct protosw *prp, int w
 	case AF_INET:
 	case AF_INET6:
 		switch (prp->pr_type) {
-		case SOCK_DGRAM:
 		case SOCK_RAW:
+			so->so_snd.sb_flags |= SB_MTXLOCK | SB_OWNLOCK;
+			/* FALLTHROUGH */
+		case SOCK_DGRAM:
 			so->so_rcv.sb_flags |= SB_MTXLOCK | SB_OWNLOCK;
 			break;
 		}
@@ -346,7 +348,10 @@ sofree(struct socket *so, int keep_lock)
 		sounsplice(so, so->so_sp->ssp_socket, freeing);
 	}
 #endif /* SOCKET_SPLICE */
+
+	mtx_enter(&so->so_snd.sb_mtx);
 	sbrelease(so, &so->so_snd);
+	mtx_leave(&so->so_snd.sb_mtx);
 
 	/*
 	 * Regardless on '_locked' postfix, must release solock() before
@@ -569,6 +574,7 @@ sosend(struct socket *so, struct mbuf *a
 	size_t resid;
 	int error;
 	int atomic = sosendallatonce(so) || top;
+	int dosolock = ((so->so_snd.sb_flags & SB_OWNLOCK) == 0);
 
 	if (uio)
 		resid = uio->uio_resid;
@@ -601,16 +607,17 @@ sosend(struct socket *so, struct mbuf *a
 
 #define	snderr(errno)	{ error = errno; goto release; }
 
-	solock_shared(so);
+	if (dosolock)
+		solock_shared(so);
 restart:
 	if ((error = sblock(so, &so->so_snd, SBLOCKWAIT(flags))) != 0)
 		goto out;
+	sb_mtx_lock(&so->so_snd);
 	so->so_snd.sb_state |= SS_ISSENDING;
 	do {
 		if (so->so_snd.sb_state & SS_CANTSENDMORE)
 			snderr(EPIPE);
-		if (so->so_error) {
-			error = so->so_error;
+		if ((error = READ_ONCE(so->so_error))) {
 			so->so_error = 0;
 			snderr(error);
 		}
@@ -638,8 +645,14 @@ restart:
 			if (flags & MSG_DONTWAIT)
 				snderr(EWOULDBLOCK);
 			sbunlock(so, &so->so_snd);
-			error = sbwait(so, &so->so_snd);
+
+			if (so->so_snd.sb_flags & SB_OWNLOCK)
+				error = sbwait_locked(so, &so->so_snd);
+			else
+				error = sbwait(so, &so->so_snd);
+
 			so->so_snd.sb_state &= ~SS_ISSENDING;
+			sb_mtx_unlock(&so->so_snd);
 			if (error)
 				goto out;
 			goto restart;
@@ -654,9 +667,11 @@ restart:
 				if (flags & MSG_EOR)
 					top->m_flags |= M_EOR;
 			} else {
-				sounlock_shared(so);
+				if (dosolock)
+					sounlock_shared(so);
 				error = m_getuio(&top, atomic, space, uio);
-				solock_shared(so);
+				if (dosolock)
+					solock_shared(so);
 				if (error)
 					goto release;
 				space -= top->m_pkthdr.len;
@@ -668,10 +683,18 @@ restart:
 				so->so_snd.sb_state &= ~SS_ISSENDING;
 			if (top && so->so_options & SO_ZEROIZE)
 				top->m_flags |= M_ZEROIZE;
+			if (!dosolock) {
+				mtx_leave(&so->so_snd.sb_mtx);
+				solock_shared(so);
+			}
 			if (flags & MSG_OOB)
 				error = pru_sendoob(so, top, addr, control);
 			else
 				error = pru_send(so, top, addr, control);
+			if (!dosolock) {
+				sounlock_shared(so);
+				mtx_enter(&so->so_snd.sb_mtx);
+			}
 			clen = 0;
 			control = NULL;
 			top = NULL;
@@ -682,9 +705,11 @@ restart:
 
 release:
 	so->so_snd.sb_state &= ~SS_ISSENDING;
+	sb_mtx_unlock(&so->so_snd);
 	sbunlock(so, &so->so_snd);
 out:
-	sounlock_shared(so);
+	if (dosolock)
+		sounlock_shared(so);
 	m_freem(top);
 	m_freem(control);
 	return (error);
Index: sys/kern/uipc_socket2.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket2.c,v
retrieving revision 1.149
diff -u -p -r1.149 uipc_socket2.c
--- sys/kern/uipc_socket2.c	11 Apr 2024 13:32:51 -0000	1.149
+++ sys/kern/uipc_socket2.c	19 Apr 2024 19:19:44 -0000
@@ -142,10 +142,15 @@ soisdisconnecting(struct socket *so)
 	soassertlocked(so);
 	so->so_state &= ~SS_ISCONNECTING;
 	so->so_state |= SS_ISDISCONNECTING;
+
 	mtx_enter(&so->so_rcv.sb_mtx);
 	so->so_rcv.sb_state |= SS_CANTRCVMORE;
 	mtx_leave(&so->so_rcv.sb_mtx);
+
+	mtx_enter(&so->so_snd.sb_mtx);
 	so->so_snd.sb_state |= SS_CANTSENDMORE;
+	mtx_leave(&so->so_snd.sb_mtx);
+
 	wakeup(&so->so_timeo);
 	sowwakeup(so);
 	sorwakeup(so);
@@ -157,10 +162,15 @@ soisdisconnected(struct socket *so)
 	soassertlocked(so);
 	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
 	so->so_state |= SS_ISDISCONNECTED;
+
 	mtx_enter(&so->so_rcv.sb_mtx);
 	so->so_rcv.sb_state |= SS_CANTRCVMORE;
 	mtx_leave(&so->so_rcv.sb_mtx);
+
+	mtx_enter(&so->so_snd.sb_mtx);
 	so->so_snd.sb_state |= SS_CANTSENDMORE;
+	mtx_leave(&so->so_snd.sb_mtx);
+
 	wakeup(&so->so_timeo);
 	sowwakeup(so);
 	sorwakeup(so);
@@ -315,7 +325,9 @@ void
 socantsendmore(struct socket *so)
 {
 	soassertlocked(so);
+	mtx_enter(&so->so_snd.sb_mtx);
 	so->so_snd.sb_state |= SS_CANTSENDMORE;
+	mtx_leave(&so->so_snd.sb_mtx);
 	sowwakeup(so);
 }
 
@@ -666,6 +678,8 @@ soreserve(struct socket *so, u_long sndc
 {
 	soassertlocked(so);
 
+	mtx_enter(&so->so_rcv.sb_mtx);
+	mtx_enter(&so->so_snd.sb_mtx);
 	if (sbreserve(so, &so->so_snd, sndcc))
 		goto bad;
 	so->so_snd.sb_wat = sndcc;
@@ -673,8 +687,6 @@ soreserve(struct socket *so, u_long sndc
 		so->so_snd.sb_lowat = MCLBYTES;
 	if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
 		so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
-
-	mtx_enter(&so->so_rcv.sb_mtx);
 	if (sbreserve(so, &so->so_rcv, rcvcc)) {
 		mtx_leave(&so->so_rcv.sb_mtx);
 		goto bad2;
@@ -682,12 +694,15 @@ soreserve(struct socket *so, u_long sndc
 	so->so_rcv.sb_wat = rcvcc;
 	if (so->so_rcv.sb_lowat == 0)
 		so->so_rcv.sb_lowat = 1;
+	mtx_leave(&so->so_snd.sb_mtx);
 	mtx_leave(&so->so_rcv.sb_mtx);
 
 	return (0);
 bad2:
 	sbrelease(so, &so->so_snd);
 bad:
+	mtx_leave(&so->so_snd.sb_mtx);
+	mtx_leave(&so->so_rcv.sb_mtx);
 	return (ENOBUFS);
 }