From: Vitaliy Makkoveev Subject: Re: Don't take solock() in sosend() and soreceive() paths for unix(4) sockets To: tech@openbsd.org Date: Wed, 1 May 2024 01:01:22 +0300 On Tue, Apr 30, 2024 at 11:01:04PM +0300, Vitaliy Makkoveev wrote: > Use `sb_mtx' mutex(9) and `sb_lock' rwlock(9) to protect `so_snd' and > `so_rcv' of unix(4) sockets. > > The transmission of unix(4) sockets already half-unlocked because > connected peer is not locked by solock() during sbappend*() call. Since > the `so_snd' is protected by `sb_mtx' mutex(9) the re-locking is not > required in uipc_rcvd() too. > > SB_OWNLOCK became redundant with SB_MTXLOCK, so remove it. SB_MTXLOCK > was kept because checks against SB_MTXLOCK within sb*() routines look > more consistent to me. > > Please note, the unlocked peer `so2' of unix(4) can't be disconnected > while solock() is held on `so'. That's why unlocked sorwakeup() and > sowwakeup() are fine, corresponding paths will never be followed. > Add forgotten fifofs chunks. Compared to the previous version, this diff performs the `so_rcv' dispose and cleanup directly in sofree(). This socket is almost dead and unlinked from everywhere, including the spliced peer, so a concurrent sotask() thread will just exit. This is required to resolve the inode and `so_rcv' lock ordering. This also removes the re-locking from sofree() for all sockets. 
Index: sys/kern/uipc_socket.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_socket.c,v diff -u -p -r1.331 uipc_socket.c --- sys/kern/uipc_socket.c 30 Apr 2024 17:59:15 -0000 1.331 +++ sys/kern/uipc_socket.c 30 Apr 2024 21:50:39 -0000 @@ -159,14 +159,15 @@ soalloc(const struct protosw *prp, int w case AF_INET6: switch (prp->pr_type) { case SOCK_RAW: - so->so_snd.sb_flags |= SB_MTXLOCK | SB_OWNLOCK; + so->so_snd.sb_flags |= SB_MTXLOCK; /* FALLTHROUGH */ case SOCK_DGRAM: - so->so_rcv.sb_flags |= SB_MTXLOCK | SB_OWNLOCK; + so->so_rcv.sb_flags |= SB_MTXLOCK; break; } break; case AF_UNIX: + so->so_snd.sb_flags |= SB_MTXLOCK; so->so_rcv.sb_flags |= SB_MTXLOCK; break; } @@ -354,18 +355,16 @@ sofree(struct socket *so, int keep_lock) mtx_leave(&so->so_snd.sb_mtx); /* - * Regardless on '_locked' postfix, must release solock() before - * call sorflush_locked() for SB_OWNLOCK marked socket. Can't - * release solock() and call sorflush() because solock() release - * is unwanted for tcp(4) socket. + * Unlocked dispose and cleanup is safe. Socket is unlinked + * from everywhere. Even concurrent sotask() thread will not + * call somove(). 
*/ + if (so->so_proto->pr_flags & PR_RIGHTS && + so->so_proto->pr_domain->dom_dispose) + (*so->so_proto->pr_domain->dom_dispose)(so->so_rcv.sb_mb); + m_purge(so->so_rcv.sb_mb); - if (so->so_rcv.sb_flags & SB_OWNLOCK) - sounlock(so); - - sorflush_locked(so); - - if (!((so->so_rcv.sb_flags & SB_OWNLOCK) || keep_lock)) + if (!keep_lock) sounlock(so); #ifdef SOCKET_SPLICE @@ -574,7 +573,7 @@ sosend(struct socket *so, struct mbuf *a size_t resid; int error; int atomic = sosendallatonce(so) || top; - int dosolock = ((so->so_snd.sb_flags & SB_OWNLOCK) == 0); + int dosolock = ((so->so_snd.sb_flags & SB_MTXLOCK) == 0); if (uio) resid = uio->uio_resid; @@ -846,7 +845,7 @@ soreceive(struct socket *so, struct mbuf const struct protosw *pr = so->so_proto; struct mbuf *nextrecord; size_t resid, orig_resid = uio->uio_resid; - int dosolock = ((so->so_rcv.sb_flags & SB_OWNLOCK) == 0); + int dosolock = ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0); mp = mp0; if (paddr) @@ -945,7 +944,7 @@ restart: SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1"); SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1"); - if (so->so_rcv.sb_flags & (SB_MTXLOCK | SB_OWNLOCK)) { + if (so->so_rcv.sb_flags & SB_MTXLOCK) { sbunlock_locked(so, &so->so_rcv); if (dosolock) sounlock_shared(so); @@ -1247,7 +1246,11 @@ dontblock: SBLASTMBUFCHK(&so->so_rcv, "soreceive 4"); if (pr->pr_flags & PR_WANTRCVD) { sb_mtx_unlock(&so->so_rcv); + if (!dosolock) + solock_shared(so); pru_rcvd(so); + if (!dosolock) + sounlock_shared(so); sb_mtx_lock(&so->so_rcv); } } @@ -1306,17 +1309,17 @@ sorflush_locked(struct socket *so) const struct protosw *pr = so->so_proto; int error; - if ((sb->sb_flags & SB_OWNLOCK) == 0) + if ((sb->sb_flags & SB_MTXLOCK) == 0) soassertlocked(so); error = sblock(so, sb, SBL_WAIT | SBL_NOINTR); /* with SBL_WAIT and SLB_NOINTR sblock() must not fail */ KASSERT(error == 0); - if (sb->sb_flags & SB_OWNLOCK) + if (sb->sb_flags & SB_MTXLOCK) solock(so); socantrcvmore(so); - if (sb->sb_flags & SB_OWNLOCK) + if 
(sb->sb_flags & SB_MTXLOCK) sounlock(so); mtx_enter(&sb->sb_mtx); @@ -1334,10 +1337,10 @@ sorflush_locked(struct socket *so) void sorflush(struct socket *so) { - if ((so->so_rcv.sb_flags & SB_OWNLOCK) == 0) + if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0) solock_shared(so); sorflush_locked(so); - if ((so->so_rcv.sb_flags & SB_OWNLOCK) == 0) + if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0) sounlock_shared(so); } @@ -1383,7 +1386,7 @@ sosplice(struct socket *so, int fd, off_ membar_consumer(); } - if (so->so_rcv.sb_flags & SB_OWNLOCK) { + if (so->so_rcv.sb_flags & SB_MTXLOCK) { if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0) return (error); solock(so); @@ -1471,7 +1474,7 @@ sosplice(struct socket *so, int fd, off_ release: sbunlock(sosp, &sosp->so_snd); out: - if (so->so_rcv.sb_flags & SB_OWNLOCK) { + if (so->so_rcv.sb_flags & SB_MTXLOCK) { sounlock(so); sbunlock(so, &so->so_rcv); } else { @@ -1885,7 +1888,8 @@ sorwakeup(struct socket *so) void sowwakeup(struct socket *so) { - soassertlocked_readonly(so); + if ((so->so_snd.sb_flags & SB_MTXLOCK) == 0) + soassertlocked_readonly(so); #ifdef SOCKET_SPLICE if (so->so_snd.sb_flags & SB_SPLICE) @@ -1976,7 +1980,7 @@ sosetopt(struct socket *so, int level, i if ((long)cnt <= 0) cnt = 1; - if (((sb->sb_flags & SB_OWNLOCK) == 0)) + if (((sb->sb_flags & SB_MTXLOCK) == 0)) solock(so); mtx_enter(&sb->sb_mtx); @@ -2003,7 +2007,7 @@ sosetopt(struct socket *so, int level, i } mtx_leave(&sb->sb_mtx); - if (((sb->sb_flags & SB_OWNLOCK) == 0)) + if (((sb->sb_flags & SB_MTXLOCK) == 0)) sounlock(so); break; @@ -2380,7 +2384,8 @@ filt_sowrite(struct knote *kn, long hint int rv; MUTEX_ASSERT_LOCKED(&so->so_snd.sb_mtx); - soassertlocked_readonly(so); + if ((so->so_snd.sb_flags & SB_MTXLOCK) == 0) + soassertlocked_readonly(so); kn->kn_data = sbspace(so, &so->so_snd); if (so->so_snd.sb_state & SS_CANTSENDMORE) { Index: sys/kern/uipc_socket2.c =================================================================== RCS file: 
/cvs/src/sys/kern/uipc_socket2.c,v diff -u -p -r1.151 uipc_socket2.c --- sys/kern/uipc_socket2.c 30 Apr 2024 17:59:15 -0000 1.151 +++ sys/kern/uipc_socket2.c 30 Apr 2024 21:50:39 -0000 @@ -228,9 +228,10 @@ sonewconn(struct socket *head, int conns */ if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) goto fail; + + mtx_enter(&head->so_snd.sb_mtx); so->so_snd.sb_wat = head->so_snd.sb_wat; so->so_snd.sb_lowat = head->so_snd.sb_lowat; - mtx_enter(&head->so_snd.sb_mtx); so->so_snd.sb_timeo_nsecs = head->so_snd.sb_timeo_nsecs; mtx_leave(&head->so_snd.sb_mtx); @@ -334,7 +335,7 @@ socantsendmore(struct socket *so) void socantrcvmore(struct socket *so) { - if ((so->so_rcv.sb_flags & SB_OWNLOCK) == 0) + if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0) soassertlocked(so); mtx_enter(&so->so_rcv.sb_mtx); @@ -543,7 +544,7 @@ sblock(struct socket *so, struct sockbuf { int error = 0, prio = PSOCK; - if (sb->sb_flags & SB_OWNLOCK) { + if (sb->sb_flags & SB_MTXLOCK) { int rwflags = RW_WRITE; if (!(flags & SBL_NOINTR || sb->sb_flags & SB_NOINTR)) @@ -586,7 +587,7 @@ out: void sbunlock_locked(struct socket *so, struct sockbuf *sb) { - if (sb->sb_flags & SB_OWNLOCK) { + if (sb->sb_flags & SB_MTXLOCK) { rw_exit(&sb->sb_lock); return; } @@ -603,7 +604,7 @@ sbunlock_locked(struct socket *so, struc void sbunlock(struct socket *so, struct sockbuf *sb) { - if (sb->sb_flags & SB_OWNLOCK) { + if (sb->sb_flags & SB_MTXLOCK) { rw_exit(&sb->sb_lock); return; } Index: sys/kern/uipc_usrreq.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v diff -u -p -r1.204 uipc_usrreq.c --- sys/kern/uipc_usrreq.c 10 Apr 2024 12:04:41 -0000 1.204 +++ sys/kern/uipc_usrreq.c 30 Apr 2024 21:50:39 -0000 @@ -477,20 +477,24 @@ uipc_dgram_shutdown(struct socket *so) void uipc_rcvd(struct socket *so) { + struct unpcb *unp = sotounpcb(so); struct socket *so2; - if ((so2 = unp_solock_peer(so)) == NULL) + if (unp->unp_conn == NULL) return; + so2 = 
unp->unp_conn->unp_socket; + /* * Adjust backpressure on sender * and wakeup any waiting to write. */ mtx_enter(&so->so_rcv.sb_mtx); + mtx_enter(&so2->so_snd.sb_mtx); so2->so_snd.sb_mbcnt = so->so_rcv.sb_mbcnt; so2->so_snd.sb_cc = so->so_rcv.sb_cc; + mtx_leave(&so2->so_snd.sb_mtx); mtx_leave(&so->so_rcv.sb_mtx); sowwakeup(so2); - sounlock(so2); } int @@ -509,10 +513,6 @@ uipc_send(struct socket *so, struct mbuf goto out; } - if (so->so_snd.sb_state & SS_CANTSENDMORE) { - error = EPIPE; - goto dispose; - } if (unp->unp_conn == NULL) { error = ENOTCONN; goto dispose; @@ -525,11 +525,23 @@ uipc_send(struct socket *so, struct mbuf * send buffer counts to maintain backpressure. * Wake up readers. */ + /* + * sbappend*() should be serialized together + * with so_snd modification. + */ mtx_enter(&so2->so_rcv.sb_mtx); + mtx_enter(&so->so_snd.sb_mtx); + if (so->so_snd.sb_state & SS_CANTSENDMORE) { + mtx_leave(&so->so_snd.sb_mtx); + mtx_leave(&so2->so_rcv.sb_mtx); + error = EPIPE; + goto dispose; + } if (control) { if (sbappendcontrol(so2, &so2->so_rcv, m, control)) { control = NULL; } else { + mtx_leave(&so->so_snd.sb_mtx); mtx_leave(&so2->so_rcv.sb_mtx); error = ENOBUFS; goto dispose; @@ -542,6 +554,7 @@ uipc_send(struct socket *so, struct mbuf so->so_snd.sb_cc = so2->so_rcv.sb_cc; if (so2->so_rcv.sb_cc > 0) dowakeup = 1; + mtx_leave(&so->so_snd.sb_mtx); mtx_leave(&so2->so_rcv.sb_mtx); if (dowakeup) Index: sys/miscfs/fifofs/fifo_vnops.c =================================================================== RCS file: /cvs/src/sys/miscfs/fifofs/fifo_vnops.c,v diff -u -p -r1.104 fifo_vnops.c --- sys/miscfs/fifofs/fifo_vnops.c 26 Mar 2024 09:46:47 -0000 1.104 +++ sys/miscfs/fifofs/fifo_vnops.c 30 Apr 2024 21:50:39 -0000 @@ -174,10 +174,10 @@ fifo_open(void *v) return (error); } fip->fi_readers = fip->fi_writers = 0; - solock(wso); + mtx_enter(&wso->so_snd.sb_mtx); wso->so_snd.sb_state |= SS_CANTSENDMORE; wso->so_snd.sb_lowat = PIPE_BUF; - sounlock(wso); + 
mtx_leave(&wso->so_snd.sb_mtx); } else { rso = fip->fi_readsock; wso = fip->fi_writesock; @@ -185,9 +185,9 @@ fifo_open(void *v) if (ap->a_mode & FREAD) { fip->fi_readers++; if (fip->fi_readers == 1) { - solock(wso); + mtx_enter(&wso->so_snd.sb_mtx); wso->so_snd.sb_state &= ~SS_CANTSENDMORE; - sounlock(wso); + mtx_leave(&wso->so_snd.sb_mtx); if (fip->fi_writers > 0) wakeup(&fip->fi_writers); } @@ -554,7 +554,6 @@ filt_fifowrite(struct knote *kn, long hi struct socket *so = kn->kn_hook; int rv; - soassertlocked(so); MUTEX_ASSERT_LOCKED(&so->so_snd.sb_mtx); kn->kn_data = sbspace(so, &so->so_snd); @@ -625,11 +624,9 @@ filt_fifowmodify(struct kevent *kev, str struct socket *so = kn->kn_hook; int rv; - solock(so); mtx_enter(&so->so_snd.sb_mtx); rv = knote_modify(kev, kn); mtx_leave(&so->so_snd.sb_mtx); - sounlock(so); return (rv); } @@ -640,11 +637,9 @@ filt_fifowprocess(struct knote *kn, stru struct socket *so = kn->kn_hook; int rv; - solock(so); mtx_enter(&so->so_snd.sb_mtx); rv = knote_process(kn, kev); mtx_leave(&so->so_snd.sb_mtx); - sounlock(so); return (rv); } Index: sys/sys/socketvar.h =================================================================== RCS file: /cvs/src/sys/sys/socketvar.h,v diff -u -p -r1.129 socketvar.h --- sys/sys/socketvar.h 11 Apr 2024 13:32:51 -0000 1.129 +++ sys/sys/socketvar.h 30 Apr 2024 21:50:39 -0000 @@ -134,8 +134,7 @@ struct socket { #define SB_ASYNC 0x0010 /* ASYNC I/O, need signals */ #define SB_SPLICE 0x0020 /* buffer is splice source or drain */ #define SB_NOINTR 0x0040 /* operations not interruptible */ -#define SB_MTXLOCK 0x0080 /* use sb_mtx for sockbuf protection */ -#define SB_OWNLOCK 0x0100 /* sblock() doesn't need solock() */ +#define SB_MTXLOCK 0x0080 /* sblock() doesn't need solock() */ void (*so_upcall)(struct socket *so, caddr_t arg, int waitf); caddr_t so_upcallarg; /* Arg for above */