From: Vitaliy Makkoveev Subject: Re: Don't take solock() in sosend() and soreceive() paths for unix(4) sockets To: Alexander Bluhm Cc: tech@openbsd.org Date: Fri, 3 May 2024 02:10:49 +0300 On Fri, May 03, 2024 at 12:26:32AM +0200, Alexander Bluhm wrote: > On Wed, May 01, 2024 at 01:01:22AM +0300, Vitaliy Makkoveev wrote: > > On Tue, Apr 30, 2024 at 11:01:04PM +0300, Vitaliy Makkoveev wrote: > > > Use `sb_mtx' mutex(9) and `sb_lock' rwlock(9) to protect `so_snd' and > > > `so_rcv' of unix(4) sockets. > > > > > > The transmission of unix(4) sockets already half-unlocked because > > > connected peer is not locked by solock() during sbappend*() call. Since > > > the `so_snd' is protected by `sb_mtx' mutex(9) the re-locking is not > > > required in uipc_rcvd() too. > > > > > > SB_OWNLOCK became redundant with SB_MTXLOCK, so remove it. SB_MTXLOCK > > > was kept because checks against SB_MTXLOCK within sb*() routines look > > > more consistent to me. > > > > > > Please note, the unlocked peer `so2' of unix(4) can't be disconnected > > > while solock() is held on `so'. That's why unlocked sorwakeup() and > > > sowwakeup() are fine, corresponding paths will never be followed. > > > > > > > Add forgotten fifofs chunks. Against previous, this diff does direct > > `so_rcv' dispose and cleanup in sofree(). This sockets is almost dead > > and unlinked from everywhere include spliced peer. So concurrent > > sotask() thread will just exit. This required to solve inode and so_rcv > > locks ordering. Also this removes re-locking from sofree() for all > > sockets. > > Passed regress with witness. > > One question, otherwise OK bluhm@ > > [skip] > > You change the order of EPIPE and ENOTCONN error checks here. > The difference is relevant, EPIPE may signal a SIGPIPE. > > When writing into a socket that stops working, the error should be > EPIPE. If there are two processes, A is writing to B, and B > disconnects the socket, then A should get a EPIPE. > > Do you change the behavior? > > In sosend() we have a SS_CANTSENDMORE check with EPIPE error. > So maybe the uipc_send() errors are irrelevant. > Yes, I change behavior. I had concerns , so I checked FreeBSD version of uipc_send() and found that they do "so->so_state & SS_ISCONNECTED" check before "so->so_snd.sb_state & SBS_CANTSENDMORE". But sosend_generic() has the different order. Since this code exists for a long time, I found this order not significant. However, our fifofs is the only place where we do direct SS_CANTSENDMORE flag modifications. We could keep solock() around them for serialization with uipc_send() and keep existing checks order. Index: sys/kern/uipc_socket.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_socket.c,v diff -u -p -r1.332 uipc_socket.c --- sys/kern/uipc_socket.c 2 May 2024 11:55:31 -0000 1.332 +++ sys/kern/uipc_socket.c 2 May 2024 22:43:12 -0000 @@ -159,14 +159,15 @@ soalloc(const struct protosw *prp, int w case AF_INET6: switch (prp->pr_type) { case SOCK_RAW: - so->so_snd.sb_flags |= SB_MTXLOCK | SB_OWNLOCK; + so->so_snd.sb_flags |= SB_MTXLOCK; /* FALLTHROUGH */ case SOCK_DGRAM: - so->so_rcv.sb_flags |= SB_MTXLOCK | SB_OWNLOCK; + so->so_rcv.sb_flags |= SB_MTXLOCK; break; } break; case AF_UNIX: + so->so_snd.sb_flags |= SB_MTXLOCK; so->so_rcv.sb_flags |= SB_MTXLOCK; break; } @@ -354,18 +355,16 @@ sofree(struct socket *so, int keep_lock) mtx_leave(&so->so_snd.sb_mtx); /* - * Regardless on '_locked' postfix, must release solock() before - * call sorflush_locked() for SB_OWNLOCK marked socket. Can't - * release solock() and call sorflush() because solock() release - * is unwanted for tcp(4) socket. + * Unlocked dispose and cleanup is safe. Socket is unlinked + * from everywhere. Even concurrent sotask() thread will not + * call somove(). */ + if (so->so_proto->pr_flags & PR_RIGHTS && + so->so_proto->pr_domain->dom_dispose) + (*so->so_proto->pr_domain->dom_dispose)(so->so_rcv.sb_mb); + m_purge(so->so_rcv.sb_mb); - if (so->so_rcv.sb_flags & SB_OWNLOCK) - sounlock(so); - - sorflush_locked(so); - - if (!((so->so_rcv.sb_flags & SB_OWNLOCK) || keep_lock)) + if (!keep_lock) sounlock(so); #ifdef SOCKET_SPLICE @@ -574,7 +573,7 @@ sosend(struct socket *so, struct mbuf *a size_t resid; int error; int atomic = sosendallatonce(so) || top; - int dosolock = ((so->so_snd.sb_flags & SB_OWNLOCK) == 0); + int dosolock = ((so->so_snd.sb_flags & SB_MTXLOCK) == 0); if (uio) resid = uio->uio_resid; @@ -846,7 +845,7 @@ soreceive(struct socket *so, struct mbuf const struct protosw *pr = so->so_proto; struct mbuf *nextrecord; size_t resid, orig_resid = uio->uio_resid; - int dosolock = ((so->so_rcv.sb_flags & SB_OWNLOCK) == 0); + int dosolock = ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0); mp = mp0; if (paddr) @@ -945,7 +944,7 @@ restart: SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1"); SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1"); - if (so->so_rcv.sb_flags & (SB_MTXLOCK | SB_OWNLOCK)) { + if (so->so_rcv.sb_flags & SB_MTXLOCK) { sbunlock_locked(so, &so->so_rcv); if (dosolock) sounlock_shared(so); @@ -1247,7 +1246,11 @@ dontblock: SBLASTMBUFCHK(&so->so_rcv, "soreceive 4"); if (pr->pr_flags & PR_WANTRCVD) { sb_mtx_unlock(&so->so_rcv); + if (!dosolock) + solock_shared(so); pru_rcvd(so); + if (!dosolock) + sounlock_shared(so); sb_mtx_lock(&so->so_rcv); } } @@ -1306,17 +1309,17 @@ sorflush_locked(struct socket *so) const struct protosw *pr = so->so_proto; int error; - if ((sb->sb_flags & SB_OWNLOCK) == 0) + if ((sb->sb_flags & SB_MTXLOCK) == 0) soassertlocked(so); error = sblock(so, sb, SBL_WAIT | SBL_NOINTR); /* with SBL_WAIT and SLB_NOINTR sblock() must not fail */ KASSERT(error == 0); - if (sb->sb_flags & SB_OWNLOCK) + if (sb->sb_flags & SB_MTXLOCK) solock(so); socantrcvmore(so); - if (sb->sb_flags & SB_OWNLOCK) + if (sb->sb_flags & SB_MTXLOCK) sounlock(so); mtx_enter(&sb->sb_mtx); @@ -1334,10 +1337,10 @@ sorflush_locked(struct socket *so) void sorflush(struct socket *so) { - if ((so->so_rcv.sb_flags & SB_OWNLOCK) == 0) + if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0) solock_shared(so); sorflush_locked(so); - if ((so->so_rcv.sb_flags & SB_OWNLOCK) == 0) + if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0) sounlock_shared(so); } @@ -1383,7 +1386,7 @@ sosplice(struct socket *so, int fd, off_ membar_consumer(); } - if (so->so_rcv.sb_flags & SB_OWNLOCK) { + if (so->so_rcv.sb_flags & SB_MTXLOCK) { if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0) return (error); solock(so); @@ -1471,7 +1474,7 @@ sosplice(struct socket *so, int fd, off_ release: sbunlock(sosp, &sosp->so_snd); out: - if (so->so_rcv.sb_flags & SB_OWNLOCK) { + if (so->so_rcv.sb_flags & SB_MTXLOCK) { sounlock(so); sbunlock(so, &so->so_rcv); } else { @@ -1885,7 +1888,8 @@ sorwakeup(struct socket *so) void sowwakeup(struct socket *so) { - soassertlocked_readonly(so); + if ((so->so_snd.sb_flags & SB_MTXLOCK) == 0) + soassertlocked_readonly(so); #ifdef SOCKET_SPLICE if (so->so_snd.sb_flags & SB_SPLICE) @@ -1976,7 +1980,7 @@ sosetopt(struct socket *so, int level, i if ((long)cnt <= 0) cnt = 1; - if (((sb->sb_flags & SB_OWNLOCK) == 0)) + if (((sb->sb_flags & SB_MTXLOCK) == 0)) solock(so); mtx_enter(&sb->sb_mtx); @@ -2003,7 +2007,7 @@ sosetopt(struct socket *so, int level, i } mtx_leave(&sb->sb_mtx); - if (((sb->sb_flags & SB_OWNLOCK) == 0)) + if (((sb->sb_flags & SB_MTXLOCK) == 0)) sounlock(so); break; @@ -2380,7 +2384,8 @@ filt_sowrite(struct knote *kn, long hint int rv; MUTEX_ASSERT_LOCKED(&so->so_snd.sb_mtx); - soassertlocked_readonly(so); + if ((so->so_snd.sb_flags & SB_MTXLOCK) == 0) + soassertlocked_readonly(so); kn->kn_data = sbspace(so, &so->so_snd); if (so->so_snd.sb_state & SS_CANTSENDMORE) { Index: sys/kern/uipc_socket2.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_socket2.c,v diff -u -p -r1.152 uipc_socket2.c --- sys/kern/uipc_socket2.c 2 May 2024 21:26:52 -0000 1.152 +++ sys/kern/uipc_socket2.c 2 May 2024 22:43:12 -0000 @@ -228,9 +228,10 @@ sonewconn(struct socket *head, int conns */ if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) goto fail; + + mtx_enter(&head->so_snd.sb_mtx); so->so_snd.sb_wat = head->so_snd.sb_wat; so->so_snd.sb_lowat = head->so_snd.sb_lowat; - mtx_enter(&head->so_snd.sb_mtx); so->so_snd.sb_timeo_nsecs = head->so_snd.sb_timeo_nsecs; mtx_leave(&head->so_snd.sb_mtx); @@ -543,7 +544,7 @@ sblock(struct socket *so, struct sockbuf { int error = 0, prio = PSOCK; - if (sb->sb_flags & SB_OWNLOCK) { + if (sb->sb_flags & SB_MTXLOCK) { int rwflags = RW_WRITE; if (!(flags & SBL_NOINTR || sb->sb_flags & SB_NOINTR)) @@ -586,7 +587,7 @@ out: void sbunlock_locked(struct socket *so, struct sockbuf *sb) { - if (sb->sb_flags & SB_OWNLOCK) { + if (sb->sb_flags & SB_MTXLOCK) { rw_exit(&sb->sb_lock); return; } @@ -603,7 +604,7 @@ sbunlock_locked(struct socket *so, struc void sbunlock(struct socket *so, struct sockbuf *sb) { - if (sb->sb_flags & SB_OWNLOCK) { + if (sb->sb_flags & SB_MTXLOCK) { rw_exit(&sb->sb_lock); return; } Index: sys/kern/uipc_usrreq.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v diff -u -p -r1.205 uipc_usrreq.c --- sys/kern/uipc_usrreq.c 2 May 2024 17:10:55 -0000 1.205 +++ sys/kern/uipc_usrreq.c 2 May 2024 22:43:12 -0000 @@ -477,20 +477,24 @@ uipc_dgram_shutdown(struct socket *so) void uipc_rcvd(struct socket *so) { + struct unpcb *unp = sotounpcb(so); struct socket *so2; - if ((so2 = unp_solock_peer(so)) == NULL) + if (unp->unp_conn == NULL) return; + so2 = unp->unp_conn->unp_socket; + /* * Adjust backpressure on sender * and wakeup any waiting to write. */ mtx_enter(&so->so_rcv.sb_mtx); + mtx_enter(&so2->so_snd.sb_mtx); so2->so_snd.sb_mbcnt = so->so_rcv.sb_mbcnt; so2->so_snd.sb_cc = so->so_rcv.sb_cc; + mtx_leave(&so2->so_snd.sb_mtx); mtx_leave(&so->so_rcv.sb_mtx); sowwakeup(so2); - sounlock(so2); } int @@ -525,11 +529,17 @@ uipc_send(struct socket *so, struct mbuf * send buffer counts to maintain backpressure. * Wake up readers. */ + /* + * sbappend*() should be serialized together + * with so_snd modification. + */ mtx_enter(&so2->so_rcv.sb_mtx); + mtx_enter(&so->so_snd.sb_mtx); if (control) { if (sbappendcontrol(so2, &so2->so_rcv, m, control)) { control = NULL; } else { + mtx_leave(&so->so_snd.sb_mtx); mtx_leave(&so2->so_rcv.sb_mtx); error = ENOBUFS; goto dispose; @@ -542,6 +552,7 @@ uipc_send(struct socket *so, struct mbuf so->so_snd.sb_cc = so2->so_rcv.sb_cc; if (so2->so_rcv.sb_cc > 0) dowakeup = 1; + mtx_leave(&so->so_snd.sb_mtx); mtx_leave(&so2->so_rcv.sb_mtx); if (dowakeup) Index: sys/miscfs/fifofs/fifo_vnops.c =================================================================== RCS file: /cvs/src/sys/miscfs/fifofs/fifo_vnops.c,v diff -u -p -r1.104 fifo_vnops.c --- sys/miscfs/fifofs/fifo_vnops.c 26 Mar 2024 09:46:47 -0000 1.104 +++ sys/miscfs/fifofs/fifo_vnops.c 2 May 2024 22:43:12 -0000 @@ -174,9 +174,12 @@ fifo_open(void *v) return (error); } fip->fi_readers = fip->fi_writers = 0; + /* Take solock() to serialize with uipc_send() */ solock(wso); + mtx_enter(&wso->so_snd.sb_mtx); wso->so_snd.sb_state |= SS_CANTSENDMORE; wso->so_snd.sb_lowat = PIPE_BUF; + mtx_leave(&wso->so_snd.sb_mtx); sounlock(wso); } else { rso = fip->fi_readsock; @@ -185,8 +188,11 @@ fifo_open(void *v) if (ap->a_mode & FREAD) { fip->fi_readers++; if (fip->fi_readers == 1) { + /* Take solock() to serialize with uipc_send() */ solock(wso); + mtx_enter(&wso->so_snd.sb_mtx); wso->so_snd.sb_state &= ~SS_CANTSENDMORE; + mtx_leave(&wso->so_snd.sb_mtx); sounlock(wso); if (fip->fi_writers > 0) wakeup(&fip->fi_writers); @@ -554,7 +560,6 @@ filt_fifowrite(struct knote *kn, long hi struct socket *so = kn->kn_hook; int rv; - soassertlocked(so); MUTEX_ASSERT_LOCKED(&so->so_snd.sb_mtx); kn->kn_data = sbspace(so, &so->so_snd); @@ -625,11 +630,9 @@ filt_fifowmodify(struct kevent *kev, str struct socket *so = kn->kn_hook; int rv; - solock(so); mtx_enter(&so->so_snd.sb_mtx); rv = knote_modify(kev, kn); mtx_leave(&so->so_snd.sb_mtx); - sounlock(so); return (rv); } @@ -640,11 +643,9 @@ filt_fifowprocess(struct knote *kn, stru struct socket *so = kn->kn_hook; int rv; - solock(so); mtx_enter(&so->so_snd.sb_mtx); rv = knote_process(kn, kev); mtx_leave(&so->so_snd.sb_mtx); - sounlock(so); return (rv); } Index: sys/sys/socketvar.h =================================================================== RCS file: /cvs/src/sys/sys/socketvar.h,v diff -u -p -r1.129 socketvar.h --- sys/sys/socketvar.h 11 Apr 2024 13:32:51 -0000 1.129 +++ sys/sys/socketvar.h 2 May 2024 22:43:12 -0000 @@ -134,8 +134,7 @@ struct socket { #define SB_ASYNC 0x0010 /* ASYNC I/O, need signals */ #define SB_SPLICE 0x0020 /* buffer is splice source or drain */ #define SB_NOINTR 0x0040 /* operations not interruptible */ -#define SB_MTXLOCK 0x0080 /* use sb_mtx for sockbuf protection */ -#define SB_OWNLOCK 0x0100 /* sblock() doesn't need solock() */ +#define SB_MTXLOCK 0x0080 /* sblock() doesn't need solock() */ void (*so_upcall)(struct socket *so, caddr_t arg, int waitf); caddr_t so_upcallarg; /* Arg for above */