Index | Thread | Search

From:
Vitaliy Makkoveev <mvs@openbsd.org>
Subject:
Re: Don't take solock() in sosend() and soreceive() paths for unix(4) sockets
To:
Alexander Bluhm <alexander.bluhm@gmx.net>
Cc:
tech@openbsd.org
Date:
Fri, 3 May 2024 02:10:49 +0300

Download raw body.

Thread
On Fri, May 03, 2024 at 12:26:32AM +0200, Alexander Bluhm wrote:
> On Wed, May 01, 2024 at 01:01:22AM +0300, Vitaliy Makkoveev wrote:
> > On Tue, Apr 30, 2024 at 11:01:04PM +0300, Vitaliy Makkoveev wrote:
> > > Use `sb_mtx' mutex(9) and `sb_lock' rwlock(9) to protect `so_snd' and
> > > `so_rcv' of unix(4) sockets.
> > > 
> > > The transmission of unix(4) sockets already half-unlocked because
> > > connected peer is not locked by solock() during sbappend*() call. Since
> > > the `so_snd' is protected by `sb_mtx' mutex(9) the re-locking is not
> > > required in uipc_rcvd() too.
> > > 
> > > SB_OWNLOCK became redundant with SB_MTXLOCK, so remove it. SB_MTXLOCK
> > > was kept because checks against SB_MTXLOCK within sb*() routines look
> > > more consistent to me.
> > > 
> > > Please note, the unlocked peer `so2' of unix(4) can't be disconnected
> > > while solock() is held on `so'. That's why unlocked sorwakeup() and
> > > sowwakeup() are fine, corresponding paths will never be followed.
> > > 
> > 
> > Add forgotten fifofs chunks. Against previous, this diff does direct
> > `so_rcv' dispose and cleanup in sofree(). This sockets is almost dead
> > and unlinked from everywhere include spliced peer. So concurrent
> > sotask() thread will just exit. This required to solve inode and so_rcv
> > locks ordering. Also this removes re-locking from sofree() for all
> > sockets.
> 
> Passed regress with witness.
> 
> One question, otherwise OK bluhm@
> 
> [skip] 
> 
> You change the order of EPIPE and ENOTCONN error checks here.
> The difference is relevant, EPIPE may signal a SIGPIPE.
> 
> When writing into a socket that stops working, the error should be
> EPIPE.  If there are two processes, A is writing to B, and B
> disconnects the socket, then A should get a EPIPE.
> 
> Do you change the behavior?
> 
> In sosend() we have a SS_CANTSENDMORE check with EPIPE error.
> So maybe the uipc_send() errors are irrelevant.
> 

Yes, I change the behavior. I had concerns, so I checked the FreeBSD version of
uipc_send() and found that they do the "so->so_state & SS_ISCONNECTED" check
before "so->so_snd.sb_state & SBS_CANTSENDMORE". But sosend_generic()
has a different order. Since this code has existed for a long time, I consider
this order not significant.

However, our fifofs is the only place where we do direct SS_CANTSENDMORE
flag modifications. We could keep solock() around them for serialization
with uipc_send() and keep the existing check order.

Index: sys/kern/uipc_socket.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket.c,v
diff -u -p -r1.332 uipc_socket.c
--- sys/kern/uipc_socket.c	2 May 2024 11:55:31 -0000	1.332
+++ sys/kern/uipc_socket.c	2 May 2024 22:43:12 -0000
@@ -159,14 +159,15 @@ soalloc(const struct protosw *prp, int w
 	case AF_INET6:
 		switch (prp->pr_type) {
 		case SOCK_RAW:
-			so->so_snd.sb_flags |= SB_MTXLOCK | SB_OWNLOCK;
+			so->so_snd.sb_flags |= SB_MTXLOCK;
 			/* FALLTHROUGH */
 		case SOCK_DGRAM:
-			so->so_rcv.sb_flags |= SB_MTXLOCK | SB_OWNLOCK;
+			so->so_rcv.sb_flags |= SB_MTXLOCK;
 			break;
 		}
 		break;
 	case AF_UNIX:
+		so->so_snd.sb_flags |= SB_MTXLOCK;
 		so->so_rcv.sb_flags |= SB_MTXLOCK;
 		break;
 	}
@@ -354,18 +355,16 @@ sofree(struct socket *so, int keep_lock)
 	mtx_leave(&so->so_snd.sb_mtx);
 
 	/*
-	 * Regardless on '_locked' postfix, must release solock() before
-	 * call sorflush_locked() for SB_OWNLOCK marked socket. Can't
-	 * release solock() and call sorflush() because solock() release
-	 * is unwanted for tcp(4) socket. 
+	 * Unlocked dispose and cleanup is safe. Socket is unlinked
+	 * from everywhere. Even concurrent sotask() thread will not
+	 * call somove().
 	 */
+	if (so->so_proto->pr_flags & PR_RIGHTS &&
+	    so->so_proto->pr_domain->dom_dispose)
+		(*so->so_proto->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
+	m_purge(so->so_rcv.sb_mb);
 
-	if (so->so_rcv.sb_flags & SB_OWNLOCK)
-		sounlock(so);
-
-	sorflush_locked(so);
-
-	if (!((so->so_rcv.sb_flags & SB_OWNLOCK) || keep_lock))
+	if (!keep_lock)
 		sounlock(so);
 
 #ifdef SOCKET_SPLICE
@@ -574,7 +573,7 @@ sosend(struct socket *so, struct mbuf *a
 	size_t resid;
 	int error;
 	int atomic = sosendallatonce(so) || top;
-	int dosolock = ((so->so_snd.sb_flags & SB_OWNLOCK) == 0);
+	int dosolock = ((so->so_snd.sb_flags & SB_MTXLOCK) == 0);
 
 	if (uio)
 		resid = uio->uio_resid;
@@ -846,7 +845,7 @@ soreceive(struct socket *so, struct mbuf
 	const struct protosw *pr = so->so_proto;
 	struct mbuf *nextrecord;
 	size_t resid, orig_resid = uio->uio_resid;
-	int dosolock = ((so->so_rcv.sb_flags & SB_OWNLOCK) == 0);
+	int dosolock = ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0);
 
 	mp = mp0;
 	if (paddr)
@@ -945,7 +944,7 @@ restart:
 		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
 		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
 
-		if (so->so_rcv.sb_flags & (SB_MTXLOCK | SB_OWNLOCK)) {
+		if (so->so_rcv.sb_flags & SB_MTXLOCK) {
 			sbunlock_locked(so, &so->so_rcv);
 			if (dosolock)
 				sounlock_shared(so);
@@ -1247,7 +1246,11 @@ dontblock:
 		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
 		if (pr->pr_flags & PR_WANTRCVD) {
 			sb_mtx_unlock(&so->so_rcv);
+			if (!dosolock)
+				solock_shared(so);
 			pru_rcvd(so);
+			if (!dosolock)
+				sounlock_shared(so);
 			sb_mtx_lock(&so->so_rcv);
 		}
 	}
@@ -1306,17 +1309,17 @@ sorflush_locked(struct socket *so)
 	const struct protosw *pr = so->so_proto;
 	int error;
 
-	if ((sb->sb_flags & SB_OWNLOCK) == 0)
+	if ((sb->sb_flags & SB_MTXLOCK) == 0)
 		soassertlocked(so);
 
 	error = sblock(so, sb, SBL_WAIT | SBL_NOINTR);
 	/* with SBL_WAIT and SLB_NOINTR sblock() must not fail */
 	KASSERT(error == 0);
 
-	if (sb->sb_flags & SB_OWNLOCK)
+	if (sb->sb_flags & SB_MTXLOCK)
 		solock(so);
 	socantrcvmore(so);
-	if (sb->sb_flags & SB_OWNLOCK)
+	if (sb->sb_flags & SB_MTXLOCK)
 		sounlock(so);
 
 	mtx_enter(&sb->sb_mtx);
@@ -1334,10 +1337,10 @@ sorflush_locked(struct socket *so)
 void
 sorflush(struct socket *so)
 {
-	if ((so->so_rcv.sb_flags & SB_OWNLOCK) == 0)
+	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
 		solock_shared(so);
 	sorflush_locked(so);
-	if ((so->so_rcv.sb_flags & SB_OWNLOCK) == 0)
+	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
 		sounlock_shared(so);
 }
 
@@ -1383,7 +1386,7 @@ sosplice(struct socket *so, int fd, off_
 		membar_consumer();
 	}
 
-	if (so->so_rcv.sb_flags & SB_OWNLOCK) {
+	if (so->so_rcv.sb_flags & SB_MTXLOCK) {
 		if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0)
 			return (error);
 		solock(so);
@@ -1471,7 +1474,7 @@ sosplice(struct socket *so, int fd, off_
  release:
 	sbunlock(sosp, &sosp->so_snd);
  out:
-	if (so->so_rcv.sb_flags & SB_OWNLOCK) {
+	if (so->so_rcv.sb_flags & SB_MTXLOCK) {
 		sounlock(so);
 		sbunlock(so, &so->so_rcv);
 	} else {
@@ -1885,7 +1888,8 @@ sorwakeup(struct socket *so)
 void
 sowwakeup(struct socket *so)
 {
-	soassertlocked_readonly(so);
+	if ((so->so_snd.sb_flags & SB_MTXLOCK) == 0)
+		soassertlocked_readonly(so);
 
 #ifdef SOCKET_SPLICE
 	if (so->so_snd.sb_flags & SB_SPLICE)
@@ -1976,7 +1980,7 @@ sosetopt(struct socket *so, int level, i
 			if ((long)cnt <= 0)
 				cnt = 1;
 
-			if (((sb->sb_flags & SB_OWNLOCK) == 0))
+			if (((sb->sb_flags & SB_MTXLOCK) == 0))
 				solock(so);
 			mtx_enter(&sb->sb_mtx);
 
@@ -2003,7 +2007,7 @@ sosetopt(struct socket *so, int level, i
 			}
 
 			mtx_leave(&sb->sb_mtx);
-			if (((sb->sb_flags & SB_OWNLOCK) == 0))
+			if (((sb->sb_flags & SB_MTXLOCK) == 0))
 				sounlock(so);
 
 			break;
@@ -2380,7 +2384,8 @@ filt_sowrite(struct knote *kn, long hint
 	int rv;
 
 	MUTEX_ASSERT_LOCKED(&so->so_snd.sb_mtx);
-	soassertlocked_readonly(so);
+	if ((so->so_snd.sb_flags & SB_MTXLOCK) == 0)
+		soassertlocked_readonly(so);
 
 	kn->kn_data = sbspace(so, &so->so_snd);
 	if (so->so_snd.sb_state & SS_CANTSENDMORE) {
Index: sys/kern/uipc_socket2.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket2.c,v
diff -u -p -r1.152 uipc_socket2.c
--- sys/kern/uipc_socket2.c	2 May 2024 21:26:52 -0000	1.152
+++ sys/kern/uipc_socket2.c	2 May 2024 22:43:12 -0000
@@ -228,9 +228,10 @@ sonewconn(struct socket *head, int conns
 	 */
 	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat))
 		goto fail;
+
+	mtx_enter(&head->so_snd.sb_mtx);
 	so->so_snd.sb_wat = head->so_snd.sb_wat;
 	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
-	mtx_enter(&head->so_snd.sb_mtx);
 	so->so_snd.sb_timeo_nsecs = head->so_snd.sb_timeo_nsecs;
 	mtx_leave(&head->so_snd.sb_mtx);
 
@@ -543,7 +544,7 @@ sblock(struct socket *so, struct sockbuf
 {
 	int error = 0, prio = PSOCK;
 
-	if (sb->sb_flags & SB_OWNLOCK) {
+	if (sb->sb_flags & SB_MTXLOCK) {
 		int rwflags = RW_WRITE;
 
 		if (!(flags & SBL_NOINTR || sb->sb_flags & SB_NOINTR))
@@ -586,7 +587,7 @@ out:
 void
 sbunlock_locked(struct socket *so, struct sockbuf *sb)
 {
-	if (sb->sb_flags & SB_OWNLOCK) {
+	if (sb->sb_flags & SB_MTXLOCK) {
 		rw_exit(&sb->sb_lock);
 		return;
 	}
@@ -603,7 +604,7 @@ sbunlock_locked(struct socket *so, struc
 void
 sbunlock(struct socket *so, struct sockbuf *sb)
 {
-	if (sb->sb_flags & SB_OWNLOCK) {
+	if (sb->sb_flags & SB_MTXLOCK) {
 		rw_exit(&sb->sb_lock);
 		return;
 	}
Index: sys/kern/uipc_usrreq.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v
diff -u -p -r1.205 uipc_usrreq.c
--- sys/kern/uipc_usrreq.c	2 May 2024 17:10:55 -0000	1.205
+++ sys/kern/uipc_usrreq.c	2 May 2024 22:43:12 -0000
@@ -477,20 +477,24 @@ uipc_dgram_shutdown(struct socket *so)
 void
 uipc_rcvd(struct socket *so)
 {
+	struct unpcb *unp = sotounpcb(so);
 	struct socket *so2;
 
-	if ((so2 = unp_solock_peer(so)) == NULL)
+	if (unp->unp_conn == NULL)
 		return;
+	so2 = unp->unp_conn->unp_socket;
+
 	/*
 	 * Adjust backpressure on sender
 	 * and wakeup any waiting to write.
 	 */
 	mtx_enter(&so->so_rcv.sb_mtx);
+	mtx_enter(&so2->so_snd.sb_mtx);
 	so2->so_snd.sb_mbcnt = so->so_rcv.sb_mbcnt;
 	so2->so_snd.sb_cc = so->so_rcv.sb_cc;
+	mtx_leave(&so2->so_snd.sb_mtx);
 	mtx_leave(&so->so_rcv.sb_mtx);
 	sowwakeup(so2);
-	sounlock(so2);
 }
 
 int
@@ -525,11 +529,17 @@ uipc_send(struct socket *so, struct mbuf
 	 * send buffer counts to maintain backpressure.
 	 * Wake up readers.
 	 */
+	/*
+	 * sbappend*() should be serialized together
+	 * with so_snd modification.
+	 */
 	mtx_enter(&so2->so_rcv.sb_mtx);
+	mtx_enter(&so->so_snd.sb_mtx);
 	if (control) {
 		if (sbappendcontrol(so2, &so2->so_rcv, m, control)) {
 			control = NULL;
 		} else {
+			mtx_leave(&so->so_snd.sb_mtx);
 			mtx_leave(&so2->so_rcv.sb_mtx);
 			error = ENOBUFS;
 			goto dispose;
@@ -542,6 +552,7 @@ uipc_send(struct socket *so, struct mbuf
 	so->so_snd.sb_cc = so2->so_rcv.sb_cc;
 	if (so2->so_rcv.sb_cc > 0)
 		dowakeup = 1;
+	mtx_leave(&so->so_snd.sb_mtx);
 	mtx_leave(&so2->so_rcv.sb_mtx);
 
 	if (dowakeup)
Index: sys/miscfs/fifofs/fifo_vnops.c
===================================================================
RCS file: /cvs/src/sys/miscfs/fifofs/fifo_vnops.c,v
diff -u -p -r1.104 fifo_vnops.c
--- sys/miscfs/fifofs/fifo_vnops.c	26 Mar 2024 09:46:47 -0000	1.104
+++ sys/miscfs/fifofs/fifo_vnops.c	2 May 2024 22:43:12 -0000
@@ -174,9 +174,12 @@ fifo_open(void *v)
 			return (error);
 		}
 		fip->fi_readers = fip->fi_writers = 0;
+		/* Take solock() to serialize with uipc_send() */
 		solock(wso);
+		mtx_enter(&wso->so_snd.sb_mtx);
 		wso->so_snd.sb_state |= SS_CANTSENDMORE;
 		wso->so_snd.sb_lowat = PIPE_BUF;
+		mtx_leave(&wso->so_snd.sb_mtx);
 		sounlock(wso);
 	} else {
 		rso = fip->fi_readsock;
@@ -185,8 +188,11 @@ fifo_open(void *v)
 	if (ap->a_mode & FREAD) {
 		fip->fi_readers++;
 		if (fip->fi_readers == 1) {
+			/* Take solock() to serialize with uipc_send() */
 			solock(wso);
+			mtx_enter(&wso->so_snd.sb_mtx);
 			wso->so_snd.sb_state &= ~SS_CANTSENDMORE;
+			mtx_leave(&wso->so_snd.sb_mtx);
 			sounlock(wso);
 			if (fip->fi_writers > 0)
 				wakeup(&fip->fi_writers);
@@ -554,7 +560,6 @@ filt_fifowrite(struct knote *kn, long hi
 	struct socket *so = kn->kn_hook;
 	int rv;
 
-	soassertlocked(so);
 	MUTEX_ASSERT_LOCKED(&so->so_snd.sb_mtx);
 
 	kn->kn_data = sbspace(so, &so->so_snd);
@@ -625,11 +630,9 @@ filt_fifowmodify(struct kevent *kev, str
 	struct socket *so = kn->kn_hook;
 	int rv;
 
-	solock(so);
 	mtx_enter(&so->so_snd.sb_mtx);
 	rv = knote_modify(kev, kn);
 	mtx_leave(&so->so_snd.sb_mtx);
-	sounlock(so);
 
 	return (rv);
 }
@@ -640,11 +643,9 @@ filt_fifowprocess(struct knote *kn, stru
 	struct socket *so = kn->kn_hook;
 	int rv;
 
-	solock(so);
 	mtx_enter(&so->so_snd.sb_mtx);
 	rv = knote_process(kn, kev);
 	mtx_leave(&so->so_snd.sb_mtx);
-	sounlock(so);
 
 	return (rv);
 }
Index: sys/sys/socketvar.h
===================================================================
RCS file: /cvs/src/sys/sys/socketvar.h,v
diff -u -p -r1.129 socketvar.h
--- sys/sys/socketvar.h	11 Apr 2024 13:32:51 -0000	1.129
+++ sys/sys/socketvar.h	2 May 2024 22:43:12 -0000
@@ -134,8 +134,7 @@ struct socket {
 #define SB_ASYNC	0x0010		/* ASYNC I/O, need signals */
 #define SB_SPLICE	0x0020		/* buffer is splice source or drain */
 #define SB_NOINTR	0x0040		/* operations not interruptible */
-#define SB_MTXLOCK	0x0080		/* use sb_mtx for sockbuf protection */
-#define SB_OWNLOCK	0x0100		/* sblock() doesn't need solock() */
+#define SB_MTXLOCK	0x0080		/* sblock() doesn't need solock() */
 
 	void	(*so_upcall)(struct socket *so, caddr_t arg, int waitf);
 	caddr_t	so_upcallarg;		/* Arg for above */