From: Vitaliy Makkoveev <mvs@openbsd.org>
Subject: Re: Use `sb_mtx' mutex(9) for the `sb_flags' protection
To: Alexander Bluhm <alexander.bluhm@gmx.net>
Cc: tech@openbsd.org
Date: Tue, 13 Feb 2024 17:11:12 +0300

On Tue, Feb 13, 2024 at 02:05:22PM +0100, Alexander Bluhm wrote:
> On Tue, Feb 13, 2024 at 03:37:40PM +0300, Vitaliy Makkoveev wrote:
> > I want to do this before going forward.
> > 
> > For now, modification of sockbuf structure members relies on the
> > `sb_mtx' mutex(9) only while the shared solock() is held; the rest
> > relies on the exclusive solock(). Make the `sb_flags' protection
> > consistent, as is already done for `sb_timeo_nsecs' and `sb_klist'.
> > 
> > The SB_SPLICE flag modification remains the special case. The `sb_mtx'
> > mutex(9) only keeps `sb_flags' consistent while the SB_SPLICE flag is
> > being modified. Serialization of the socket splice paths still rests
> > with the netlock.
> 
> Is it a good idea to rip apart the atomicity of so_rcv.sb_flags and
> so_snd.sb_flags?  Either the socket is async or it is not.  Now you
> have sockets that are async on the receive side, but not on the
> send side.
> 

Is this the problem? The IO operations on `so_snd' and `so_rcv' are
asynchronous, including the wakeup and SIGIO delivery. Also, we have
SB_ASYNC for each sockbuf, but no SO_ASYNC or SS_ASYNC for the whole
socket. `so_sigio' is common to the whole socket, but the flags are
not.

Also, there is no problem with taking both mutexes together before
modifying SB_ASYNC.
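
A minimal sketch of that idea, mirroring the FIOASYNC hunk below; the
helper name and the so_rcv-before-so_snd lock order are only for
illustration:

	/* hypothetical helper, not part of the diff */
	void
	sosetasync(struct socket *so, int on)
	{
		/*
		 * With both sockbuf mutexes held, the pair of SB_ASYNC
		 * bits changes atomically relative to any other path
		 * that also takes both mutexes.  The diff initializes
		 * both with MTX_DUPOK so they may be held together.
		 */
		mtx_enter(&so->so_rcv.sb_mtx);
		mtx_enter(&so->so_snd.sb_mtx);
		if (on) {
			so->so_rcv.sb_flags |= SB_ASYNC;
			so->so_snd.sb_flags |= SB_ASYNC;
		} else {
			so->so_rcv.sb_flags &= ~SB_ASYNC;
			so->so_snd.sb_flags &= ~SB_ASYNC;
		}
		mtx_leave(&so->so_snd.sb_mtx);
		mtx_leave(&so->so_rcv.sb_mtx);
	}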

> Also for socket splicing, so->so_rcv.sb_flags and sosp->so_snd.sb_flags
> should be consistent.  For now, the exclusive net lock keeps things
> together.  But when you introduce a mutex, you should not only look at
> the individual fields, but also at the context.
>

They still are. The SB_SPLICE modification on both `so' and `sosp' still
requires the exclusive netlock to be taken first, before the SB_SPLICE
flag modification and all other read-only SB_SPLICE check paths.

> > The SB_SPLICE flag modification remains the special case. The `sb_mtx'
> > mutex(9) only keeps `sb_flags' consistent while the SB_SPLICE flag is
> > being modified. Serialization of the socket splice paths still rests
> > with the netlock.

The sosplice(), somove(), sounsplice(), etc. paths are netlock
serialized, but the `sb_flags' modifications require `sb_mtx' both
outside and inside the netlock protected sections.
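
For example, a read-only SB_SPLICE check that runs without the netlock
can still be made consistent under `sb_mtx' alone, in the spirit of the
sb_notify() change in the socketvar.h hunk; a rough sketch, with a
hypothetical helper name:

	int
	sb_spliced(struct sockbuf *sb)
	{
		int rv;

		/*
		 * No netlock here.  `sb_mtx' keeps the read of
		 * `sb_flags' consistent with the writers, which set and
		 * clear SB_SPLICE while holding both the exclusive
		 * netlock and `sb_mtx'.
		 */
		mtx_enter(&sb->sb_mtx);
		rv = (sb->sb_flags & SB_SPLICE) != 0;
		mtx_leave(&sb->sb_mtx);

		return (rv);
	}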

> short sb_flags and short sb_state live in the same integer.  I am
> not sure if we may have different locking strategies for them.  At
> least the alpha CPU may modify one of them with a common 32 bit access.
> 

This is the problem. So, I propose to use `sb_mtx' for `sb_state'
modification too.
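
To illustrate the hazard: `sb_flags' and `sb_state' are adjacent shorts,
and a CPU without sub-word stores (pre-BWX alpha) implements a 16 bit
store as a read-modify-write of the containing 32 bit word.  A rough
sketch of the lost update, not code from the tree:

	struct sockbuf_halves {		/* simplified layout of the two shorts */
		short	sb_flags;	/* written under sb_mtx */
		short	sb_state;	/* so far written under solock() */
	};

	/*
	 * CPU A (holds sb_mtx)		CPU B (holds solock())
	 *	load 32 bit word
	 *				load 32 bit word
	 *	set SB_ASYNC in its half
	 *				set SS_CANTRCVMORE in its half
	 *	store 32 bit word
	 *				store 32 bit word  <- loses A's update
	 *
	 * Taking `sb_mtx' for `sb_state' updates as well means the word
	 * is only ever rewritten by one lock owner at a time.
	 */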

Index: sys/kern/sys_socket.c
===================================================================
RCS file: /cvs/src/sys/kern/sys_socket.c,v
retrieving revision 1.61
diff -u -p -r1.61 sys_socket.c
--- sys/kern/sys_socket.c	15 Apr 2023 13:18:28 -0000	1.61
+++ sys/kern/sys_socket.c	13 Feb 2024 14:06:33 -0000
@@ -90,7 +90,8 @@ soo_ioctl(struct file *fp, u_long cmd, c
 		break;
 
 	case FIOASYNC:
-		solock(so);
+		mtx_enter(&so->so_rcv.sb_mtx);
+		mtx_enter(&so->so_snd.sb_mtx);
 		if (*(int *)data) {
 			so->so_rcv.sb_flags |= SB_ASYNC;
 			so->so_snd.sb_flags |= SB_ASYNC;
@@ -98,7 +99,8 @@ soo_ioctl(struct file *fp, u_long cmd, c
 			so->so_rcv.sb_flags &= ~SB_ASYNC;
 			so->so_snd.sb_flags &= ~SB_ASYNC;
 		}
-		sounlock(so);
+		mtx_leave(&so->so_snd.sb_mtx);
+		mtx_leave(&so->so_rcv.sb_mtx);
 		break;
 
 	case FIONREAD:
Index: sys/kern/uipc_socket.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.320
diff -u -p -r1.320 uipc_socket.c
--- sys/kern/uipc_socket.c	12 Feb 2024 22:48:27 -0000	1.320
+++ sys/kern/uipc_socket.c	13 Feb 2024 14:06:33 -0000
@@ -143,8 +143,8 @@ soalloc(const struct protosw *prp, int w
 		return (NULL);
 	rw_init_flags(&so->so_lock, dp->dom_name, RWL_DUPOK);
 	refcnt_init(&so->so_refcnt);
-	mtx_init(&so->so_rcv.sb_mtx, IPL_MPFLOOR);
-	mtx_init(&so->so_snd.sb_mtx, IPL_MPFLOOR);
+	mtx_init_flags(&so->so_rcv.sb_mtx, IPL_MPFLOOR, "sbrcv", MTX_DUPOK);
+	mtx_init_flags(&so->so_snd.sb_mtx, IPL_MPFLOOR, "sbsnd", MTX_DUPOK);
 	klist_init_mutex(&so->so_rcv.sb_klist, &so->so_rcv.sb_mtx);
 	klist_init_mutex(&so->so_snd.sb_klist, &so->so_snd.sb_mtx);
 	sigio_init(&so->so_sigio);
@@ -587,7 +587,9 @@ sosend(struct socket *so, struct mbuf *a
 restart:
 	if ((error = sblock(so, &so->so_snd, SBLOCKWAIT(flags))) != 0)
 		goto out;
+	mtx_enter(&so->so_snd.sb_mtx);
 	so->so_snd.sb_state |= SS_ISSENDING;
+	mtx_leave(&so->so_snd.sb_mtx);
 	do {
 		if (so->so_snd.sb_state & SS_CANTSENDMORE)
 			snderr(EPIPE);
@@ -621,7 +623,9 @@ restart:
 				snderr(EWOULDBLOCK);
 			sbunlock(so, &so->so_snd);
 			error = sbwait(so, &so->so_snd);
+			mtx_enter(&so->so_snd.sb_mtx);
 			so->so_snd.sb_state &= ~SS_ISSENDING;
+			mtx_leave(&so->so_snd.sb_mtx);
 			if (error)
 				goto out;
 			goto restart;
@@ -663,7 +667,9 @@ restart:
 	} while (resid);
 
 release:
+	mtx_enter(&so->so_snd.sb_mtx);
 	so->so_snd.sb_state &= ~SS_ISSENDING;
+	mtx_leave(&so->so_snd.sb_mtx);
 	sbunlock(so, &so->so_snd);
 out:
 	sounlock_shared(so);
@@ -838,7 +844,7 @@ restart:
 		sounlock_shared(so);
 		return (error);
 	}
-	sb_mtx_lock(&so->so_rcv);
+	mtx_enter(&so->so_rcv.sb_mtx);
 
 	m = so->so_rcv.sb_mb;
 #ifdef SOCKET_SPLICE
@@ -901,7 +907,7 @@ restart:
 		}
 		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
 		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
-		sb_mtx_unlock(&so->so_rcv);
+		mtx_leave(&so->so_rcv.sb_mtx);
 		sbunlock(so, &so->so_rcv);
 		error = sbwait(so, &so->so_rcv);
 		if (error) {
@@ -974,13 +980,13 @@ dontblock:
 			sbsync(&so->so_rcv, nextrecord);
 			if (controlp) {
 				if (pr->pr_domain->dom_externalize) {
-					sb_mtx_unlock(&so->so_rcv);
+					mtx_leave(&so->so_rcv.sb_mtx);
 					sounlock_shared(so);
 					error =
 					    (*pr->pr_domain->dom_externalize)
 					    (cm, controllen, flags);
 					solock_shared(so);
-					sb_mtx_lock(&so->so_rcv);
+					mtx_enter(&so->so_rcv.sb_mtx);
 				}
 				*controlp = cm;
 			} else {
@@ -988,8 +994,11 @@ dontblock:
 				 * Dispose of any SCM_RIGHTS message that went
 				 * through the read path rather than recv.
 				 */
-				if (pr->pr_domain->dom_dispose)
+				if (pr->pr_domain->dom_dispose) {
+					mtx_leave(&so->so_rcv.sb_mtx);
 					pr->pr_domain->dom_dispose(cm);
+					mtx_enter(&so->so_rcv.sb_mtx);
+				}
 				m_free(cm);
 			}
 		}
@@ -1054,11 +1063,11 @@ dontblock:
 			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
 			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
 			resid = uio->uio_resid;
-			sb_mtx_unlock(&so->so_rcv);
+			mtx_leave(&so->so_rcv.sb_mtx);
 			sounlock_shared(so);
 			uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio);
 			solock_shared(so);
-			sb_mtx_lock(&so->so_rcv);
+			mtx_enter(&so->so_rcv.sb_mtx);
 			if (uio_error)
 				uio->uio_resid = resid - len;
 		} else
@@ -1140,14 +1149,14 @@ dontblock:
 				break;
 			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
 			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
-			sb_mtx_unlock(&so->so_rcv);
+			mtx_leave(&so->so_rcv.sb_mtx);
 			error = sbwait(so, &so->so_rcv);
 			if (error) {
 				sbunlock(so, &so->so_rcv);
 				sounlock_shared(so);
 				return (0);
 			}
-			sb_mtx_lock(&so->so_rcv);
+			mtx_enter(&so->so_rcv.sb_mtx);
 			if ((m = so->so_rcv.sb_mb) != NULL)
 				nextrecord = m->m_nextpkt;
 		}
@@ -1174,13 +1183,16 @@ dontblock:
 		}
 		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
 		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
-		if (pr->pr_flags & PR_WANTRCVD)
+		if (pr->pr_flags & PR_WANTRCVD) {
+			mtx_leave(&so->so_rcv.sb_mtx);
 			pru_rcvd(so);
+			mtx_enter(&so->so_rcv.sb_mtx);
+		}
 	}
 	if (orig_resid == uio->uio_resid && orig_resid &&
 	    (flags & MSG_EOR) == 0 &&
 	    (so->so_rcv.sb_state & SS_CANTRCVMORE) == 0) {
-		sb_mtx_unlock(&so->so_rcv);
+		mtx_leave(&so->so_rcv.sb_mtx);
 		sbunlock(so, &so->so_rcv);
 		goto restart;
 	}
@@ -1191,7 +1203,7 @@ dontblock:
 	if (flagsp)
 		*flagsp |= flags;
 release:
-	sb_mtx_unlock(&so->so_rcv);
+	mtx_leave(&so->so_rcv.sb_mtx);
 	sbunlock(so, &so->so_rcv);
 	sounlock_shared(so);
 	return (error);
@@ -1373,8 +1385,13 @@ sosplice(struct socket *so, int fd, off_
 	 * we sleep, the socket buffers are not marked as spliced yet.
 	 */
 	if (somove(so, M_WAIT)) {
+		mtx_enter(&so->so_rcv.sb_mtx);
 		so->so_rcv.sb_flags |= SB_SPLICE;
+		mtx_leave(&so->so_rcv.sb_mtx);
+
+		mtx_enter(&sosp->so_snd.sb_mtx);
 		sosp->so_snd.sb_flags |= SB_SPLICE;
+		mtx_leave(&sosp->so_snd.sb_mtx);
 	}
 
  release:
@@ -1400,8 +1417,15 @@ sounsplice(struct socket *so, struct soc
 
 	task_del(sosplice_taskq, &so->so_splicetask);
 	timeout_del(&so->so_idleto);
+
+	mtx_enter(&sosp->so_snd.sb_mtx);
 	sosp->so_snd.sb_flags &= ~SB_SPLICE;
+	mtx_leave(&sosp->so_snd.sb_mtx);
+
+	mtx_enter(&so->so_rcv.sb_mtx);
 	so->so_rcv.sb_flags &= ~SB_SPLICE;
+	mtx_leave(&so->so_rcv.sb_mtx);
+
 	so->so_sp->ssp_socket = sosp->so_sp->ssp_soback = NULL;
 	/* Do not wakeup a socket that is about to be freed. */
 	if ((freeing & SOSP_FREEING_READ) == 0 && soreadable(so))
Index: sys/kern/uipc_socket2.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket2.c,v
retrieving revision 1.144
diff -u -p -r1.144 uipc_socket2.c
--- sys/kern/uipc_socket2.c	12 Feb 2024 22:48:27 -0000	1.144
+++ sys/kern/uipc_socket2.c	13 Feb 2024 14:06:33 -0000
@@ -140,26 +140,40 @@ void
 soisdisconnecting(struct socket *so)
 {
 	soassertlocked(so);
+
 	so->so_state &= ~SS_ISCONNECTING;
 	so->so_state |= SS_ISDISCONNECTING;
+	wakeup(&so->so_timeo);
+
+	mtx_enter(&so->so_rcv.sb_mtx);
 	so->so_rcv.sb_state |= SS_CANTRCVMORE;
+	mtx_leave(&so->so_rcv.sb_mtx);
+	sorwakeup(so);
+
+	mtx_enter(&so->so_snd.sb_mtx);
 	so->so_snd.sb_state |= SS_CANTSENDMORE;
-	wakeup(&so->so_timeo);
+	mtx_leave(&so->so_snd.sb_mtx);
 	sowwakeup(so);
-	sorwakeup(so);
 }
 
 void
 soisdisconnected(struct socket *so)
 {
 	soassertlocked(so);
+
 	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
 	so->so_state |= SS_ISDISCONNECTED;
+	wakeup(&so->so_timeo);
+
+	mtx_enter(&so->so_rcv.sb_mtx);
 	so->so_rcv.sb_state |= SS_CANTRCVMORE;
+	mtx_leave(&so->so_rcv.sb_mtx);
+	sorwakeup(so);
+
+	mtx_enter(&so->so_snd.sb_mtx);
 	so->so_snd.sb_state |= SS_CANTSENDMORE;
-	wakeup(&so->so_timeo);
+	mtx_leave(&so->so_snd.sb_mtx);
 	sowwakeup(so);
-	sorwakeup(so);
 }
 
 /*
@@ -521,12 +535,13 @@ int
 sbwait(struct socket *so, struct sockbuf *sb)
 {
 	uint64_t timeo_nsecs;
-	int prio = (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH;
+	int prio;
 
 	soassertlocked(so);
 
 	mtx_enter(&sb->sb_mtx);
 	timeo_nsecs = sb->sb_timeo_nsecs;
+	prio = (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH;
 	sb->sb_flags |= SB_WAIT;
 	mtx_leave(&sb->sb_mtx);
 
Index: sys/miscfs/fifofs/fifo_vnops.c
===================================================================
RCS file: /cvs/src/sys/miscfs/fifofs/fifo_vnops.c,v
retrieving revision 1.103
diff -u -p -r1.103 fifo_vnops.c
--- sys/miscfs/fifofs/fifo_vnops.c	3 Feb 2024 22:50:09 -0000	1.103
+++ sys/miscfs/fifofs/fifo_vnops.c	13 Feb 2024 14:06:33 -0000
@@ -175,8 +175,10 @@ fifo_open(void *v)
 		}
 		fip->fi_readers = fip->fi_writers = 0;
 		solock(wso);
+		mtx_enter(&wso->so_snd.sb_mtx);
 		wso->so_snd.sb_state |= SS_CANTSENDMORE;
 		wso->so_snd.sb_lowat = PIPE_BUF;
+		mtx_leave(&wso->so_snd.sb_mtx);
 		sounlock(wso);
 	} else {
 		rso = fip->fi_readsock;
@@ -186,7 +188,9 @@ fifo_open(void *v)
 		fip->fi_readers++;
 		if (fip->fi_readers == 1) {
 			solock(wso);
+			mtx_enter(&wso->so_snd.sb_mtx);
 			wso->so_snd.sb_state &= ~SS_CANTSENDMORE;
+			mtx_leave(&wso->so_snd.sb_mtx);
 			sounlock(wso);
 			if (fip->fi_writers > 0)
 				wakeup(&fip->fi_writers);
@@ -201,7 +205,9 @@ fifo_open(void *v)
 		if (fip->fi_writers == 1) {
 			solock(rso);
 			rso->so_state &= ~SS_ISDISCONNECTED;
+			mtx_enter(&rso->so_rcv.sb_mtx);
 			rso->so_rcv.sb_state &= ~SS_CANTRCVMORE;
+			mtx_leave(&rso->so_rcv.sb_mtx);
 			sounlock(rso);
 			if (fip->fi_readers > 0)
 				wakeup(&fip->fi_readers);
Index: sys/netinet/tcp_input.c
===================================================================
RCS file: /cvs/src/sys/netinet/tcp_input.c,v
retrieving revision 1.400
diff -u -p -r1.400 tcp_input.c
--- sys/netinet/tcp_input.c	11 Feb 2024 01:27:45 -0000	1.400
+++ sys/netinet/tcp_input.c	13 Feb 2024 14:06:33 -0000
@@ -1898,10 +1898,12 @@ step6:
 		 */
 		if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
 			tp->rcv_up = th->th_seq + th->th_urp;
+			mtx_enter(&so->so_rcv.sb_mtx);
 			so->so_oobmark = so->so_rcv.sb_cc +
 			    (tp->rcv_up - tp->rcv_nxt) - 1;
 			if (so->so_oobmark == 0)
 				so->so_rcv.sb_state |= SS_RCVATMARK;
+			mtx_leave(&so->so_rcv.sb_mtx);
 			sohasoutofband(so);
 			tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
 		}
Index: sys/nfs/nfs_socket.c
===================================================================
RCS file: /cvs/src/sys/nfs/nfs_socket.c,v
retrieving revision 1.145
diff -u -p -r1.145 nfs_socket.c
--- sys/nfs/nfs_socket.c	5 Feb 2024 20:21:39 -0000	1.145
+++ sys/nfs/nfs_socket.c	13 Feb 2024 14:06:33 -0000
@@ -369,11 +369,17 @@ nfs_connect(struct nfsmount *nmp, struct
 	}
 	solock(so);
 	error = soreserve(so, sndreserve, rcvreserve);
+	sounlock(so);
 	if (error)
-		goto bad_locked;
+		goto bad;
+
+	mtx_enter(&so->so_rcv.sb_mtx);
 	so->so_rcv.sb_flags |= SB_NOINTR;
+	mtx_leave(&so->so_rcv.sb_mtx);
+
+	mtx_enter(&so->so_snd.sb_mtx);
 	so->so_snd.sb_flags |= SB_NOINTR;
-	sounlock(so);
+	mtx_leave(&so->so_snd.sb_mtx);
 
 	m_freem(mopt);
 	m_freem(nam);
Index: sys/nfs/nfs_syscalls.c
===================================================================
RCS file: /cvs/src/sys/nfs/nfs_syscalls.c,v
retrieving revision 1.121
diff -u -p -r1.121 nfs_syscalls.c
--- sys/nfs/nfs_syscalls.c	5 Feb 2024 20:21:39 -0000	1.121
+++ sys/nfs/nfs_syscalls.c	13 Feb 2024 14:06:33 -0000
@@ -275,16 +275,17 @@ nfssvc_addsock(struct file *fp, struct m
 		sosetopt(so, IPPROTO_TCP, TCP_NODELAY, m);
 		m_freem(m);
 	}
-	solock(so);
-	so->so_rcv.sb_flags &= ~SB_NOINTR;
+
 	mtx_enter(&so->so_rcv.sb_mtx);
+	so->so_rcv.sb_flags &= ~SB_NOINTR;
 	so->so_rcv.sb_timeo_nsecs = INFSLP;
 	mtx_leave(&so->so_rcv.sb_mtx);
-	so->so_snd.sb_flags &= ~SB_NOINTR;
+
 	mtx_enter(&so->so_snd.sb_mtx);
+	so->so_snd.sb_flags &= ~SB_NOINTR;
 	so->so_snd.sb_timeo_nsecs = INFSLP;
 	mtx_leave(&so->so_snd.sb_mtx);
-	sounlock(so);
+
 	if (tslp)
 		slp = tslp;
 	else {
Index: sys/sys/socketvar.h
===================================================================
RCS file: /cvs/src/sys/sys/socketvar.h,v
retrieving revision 1.124
diff -u -p -r1.124 socketvar.h
--- sys/sys/socketvar.h	12 Feb 2024 22:48:27 -0000	1.124
+++ sys/sys/socketvar.h	13 Feb 2024 14:06:34 -0000
@@ -49,6 +49,11 @@
 typedef	__socklen_t	socklen_t;	/* length type for network syscalls */
 #endif
 
+/*
+ * Locks used to protect struct members:
+ *	m	sb_mtx
+ */
+
 TAILQ_HEAD(soqhead, socket);
 
 /*
@@ -120,12 +125,12 @@ struct socket {
 		struct mbuf *sb_mbtail;	/* the last mbuf in the chain */
 		struct mbuf *sb_lastrecord;/* first mbuf of last record in
 					      socket buffer */
-		short	sb_flags;	/* flags, see below */
+		short	sb_flags;	/* [m] flags, see below */
 /* End area that is zeroed on flush. */
 #define	sb_endzero	sb_flags
 		short	sb_state;	/* socket state on sockbuf */
-		uint64_t sb_timeo_nsecs;/* timeout for read/write */
-		struct klist sb_klist;	/* process selecting read/write */
+		uint64_t sb_timeo_nsecs;/* [m] timeout for read/write */
+		struct klist sb_klist;	/* [m] process selecting read/write */
 	} so_rcv, so_snd;
 #define	SB_MAX		(2*1024*1024)	/* default for max chars in sockbuf */
 #define	SB_LOCK		0x01		/* lock on data queue */
@@ -221,8 +226,6 @@ static inline int
 sb_notify(struct socket *so, struct sockbuf *sb)
 {
 	int rv;
-
-	soassertlocked(so);
 
 	mtx_enter(&sb->sb_mtx);
 	rv = ((sb->sb_flags & (SB_WAIT|SB_ASYNC|SB_SPLICE)) != 0 ||