From: Alexander Bluhm Subject: Re: Turn sblock() to `sb_lock' rwlock(9) wrapper for all sockets To: Vitaliy Makkoveev Cc: tech@openbsd.org Date: Fri, 17 May 2024 19:20:08 +0200 On Tue, May 07, 2024 at 11:23:17PM +0300, Vitaliy Makkoveev wrote: > On Sat, May 04, 2024 at 02:57:00AM +0300, Vitaliy Makkoveev wrote: > > sblock() is fine-grained lock which serializes socket send and receive > > routines on `so_rcv' or `so_snd' buffers. There is no big problem to > > wait netlock while holding sblock(). > > > > So unify sblock() behaviour to all sockets. Now it should be always > > taken before solock() in all involved paths as sosend(), soreceive(), > > sorflush() and sosplice() as already done for "unlocked" socket buffers. > > > > This unification removes a lot of temporary "sb_flags & SB_MTXLOCK" code > > from sockets layer. This unification makes straight "solock()" and > > "sblock()" lock order, no more solock() -> sblock() -> sounlock() -> > > solock() -> sbunlock() -> sounlock() chains in sosend() and soreceive() > > paths. This unification brings witness(4) support for sblock(), include > > NFS involved sockets, which is useful. > > > > Since the witness(4) support was introduced to sblock() with this diff, > > some new witness reports appeared. That's normal, it's the first time > > they were exposed. In addition to [1] "lock order reversal in soreceive > > and NFS" thread, I could say that `so_snd' also produces witness > > reports during regress/sys/ffs/nfs run. > > > > Updated against last tree changes. In addition, I removed unused `so' > argument from sblock()/sbunlock(). OK bluhm@ > Index: sys/kern/uipc_socket.c > =================================================================== > RCS file: /cvs/src/sys/kern/uipc_socket.c,v > diff -u -p -r1.333 uipc_socket.c > --- sys/kern/uipc_socket.c 3 May 2024 17:43:09 -0000 1.333 > +++ sys/kern/uipc_socket.c 7 May 2024 19:27:18 -0000 > @@ -66,7 +66,6 @@ void soreaper(void *); > void soput(void *); > int somove(struct socket *, int); > void sorflush(struct socket *); > -void sorflush_locked(struct socket *); > > void filt_sordetach(struct knote *kn); > int filt_soread(struct knote *kn, long hint); > @@ -606,11 +605,11 @@ sosend(struct socket *so, struct mbuf *a > > #define snderr(errno) { error = errno; goto release; } > > - if (dosolock) > - solock_shared(so); > restart: > - if ((error = sblock(so, &so->so_snd, SBLOCKWAIT(flags))) != 0) > + if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0) > goto out; > + if (dosolock) > + solock_shared(so); > sb_mtx_lock(&so->so_snd); > so->so_snd.sb_state |= SS_ISSENDING; > do { > @@ -643,15 +642,12 @@ restart: > (atomic || space < so->so_snd.sb_lowat))) { > if (flags & MSG_DONTWAIT) > snderr(EWOULDBLOCK); > - sbunlock(so, &so->so_snd); > - > - if (so->so_snd.sb_flags & SB_MTXLOCK) > - error = sbwait_locked(so, &so->so_snd); > - else > - error = sbwait(so, &so->so_snd); > - > + sbunlock(&so->so_snd); > + error = sbwait(so, &so->so_snd); > so->so_snd.sb_state &= ~SS_ISSENDING; > sb_mtx_unlock(&so->so_snd); > + if (dosolock) > + sounlock_shared(so); > if (error) > goto out; > goto restart; > @@ -705,10 +701,10 @@ restart: > release: > so->so_snd.sb_state &= ~SS_ISSENDING; > sb_mtx_unlock(&so->so_snd); > - sbunlock(so, &so->so_snd); > -out: > if (dosolock) > sounlock_shared(so); > + sbunlock(&so->so_snd); > +out: > m_freem(top); > m_freem(control); > return (error); > @@ -875,11 +871,11 @@ bad: > if (mp) > *mp = NULL; > > +restart: > + if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0) > + return (error); > if (dosolock) > solock_shared(so); > -restart: > - if ((error = sblock(so, &so->so_rcv, SBLOCKWAIT(flags))) != 0) > - goto out; > sb_mtx_lock(&so->so_rcv); > > m = so->so_rcv.sb_mb; > @@ -944,25 +940,13 @@ restart: > SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1"); > SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1"); > > - if (so->so_rcv.sb_flags & SB_MTXLOCK) { > - sbunlock_locked(so, &so->so_rcv); > - if (dosolock) > - sounlock_shared(so); > - error = sbwait_locked(so, &so->so_rcv); > - sb_mtx_unlock(&so->so_rcv); > - if (error) > - return (error); > - if (dosolock) > - solock_shared(so); > - } else { > - sb_mtx_unlock(&so->so_rcv); > - sbunlock(so, &so->so_rcv); > - error = sbwait(so, &so->so_rcv); > - if (error) { > - sounlock_shared(so); > - return (error); > - } > - } > + sbunlock(&so->so_rcv); > + error = sbwait(so, &so->so_rcv); > + sb_mtx_unlock(&so->so_rcv); > + if (dosolock) > + sounlock_shared(so); > + if (error) > + return (error); > goto restart; > } > dontblock: > @@ -1202,21 +1186,12 @@ dontblock: > break; > SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2"); > SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2"); > - if (dosolock) { > + if (sbwait(so, &so->so_rcv)) { > sb_mtx_unlock(&so->so_rcv); > - error = sbwait(so, &so->so_rcv); > - if (error) { > - sbunlock(so, &so->so_rcv); > + if (dosolock) > sounlock_shared(so); > - return (0); > - } > - sb_mtx_lock(&so->so_rcv); > - } else { > - if (sbwait_locked(so, &so->so_rcv)) { > - sb_mtx_unlock(&so->so_rcv); > - sbunlock(so, &so->so_rcv); > - return (0); > - } > + sbunlock(&so->so_rcv); > + return (0); > } > if ((m = so->so_rcv.sb_mb) != NULL) > nextrecord = m->m_nextpkt; > @@ -1258,7 +1233,7 @@ dontblock: > (flags & MSG_EOR) == 0 && > (so->so_rcv.sb_state & SS_CANTRCVMORE) == 0) { > sb_mtx_unlock(&so->so_rcv); > - sbunlock(so, &so->so_rcv); > + sbunlock(&so->so_rcv); > goto restart; > } > > @@ -1269,10 +1244,9 @@ dontblock: > *flagsp |= flags; > release: > sb_mtx_unlock(&so->so_rcv); > - sbunlock(so, &so->so_rcv); > -out: > if (dosolock) > sounlock_shared(so); > + sbunlock(&so->so_rcv); > return (error); > } > > @@ -1302,48 +1276,33 @@ soshutdown(struct socket *so, int how) > } > > void > -sorflush_locked(struct socket *so) > +sorflush(struct socket *so) > { > struct sockbuf *sb = &so->so_rcv; > struct mbuf *m; > const struct protosw *pr = so->so_proto; > int error; > > - if ((sb->sb_flags & SB_MTXLOCK) == 0) > - soassertlocked(so); > - > - error = sblock(so, sb, SBL_WAIT | SBL_NOINTR); > + error = sblock(sb, SBL_WAIT | SBL_NOINTR); > /* with SBL_WAIT and SLB_NOINTR sblock() must not fail */ > KASSERT(error == 0); > > - if (sb->sb_flags & SB_MTXLOCK) > - solock(so); > + solock_shared(so); > socantrcvmore(so); > - if (sb->sb_flags & SB_MTXLOCK) > - sounlock(so); > - > mtx_enter(&sb->sb_mtx); > m = sb->sb_mb; > memset(&sb->sb_startzero, 0, > (caddr_t)&sb->sb_endzero - (caddr_t)&sb->sb_startzero); > sb->sb_timeo_nsecs = INFSLP; > mtx_leave(&sb->sb_mtx); > - sbunlock(so, sb); > + sounlock_shared(so); > + sbunlock(sb); > + > if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose) > (*pr->pr_domain->dom_dispose)(m); > m_purge(m); > } > > -void > -sorflush(struct socket *so) > -{ > - if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0) > - solock_shared(so); > - sorflush_locked(so); > - if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0) > - sounlock_shared(so); > -} > - > #ifdef SOCKET_SPLICE > > #define so_splicelen so_sp->ssp_len > @@ -1355,7 +1314,7 @@ sorflush(struct socket *so) > int > sosplice(struct socket *so, int fd, off_t max, struct timeval *tv) > { > - struct file *fp = NULL; > + struct file *fp; > struct socket *sosp; > struct taskq *tq; > int error = 0; > @@ -1367,6 +1326,29 @@ sosplice(struct socket *so, int fd, off_ > if (tv && (tv->tv_sec < 0 || !timerisvalid(tv))) > return (EINVAL); > > + /* If no fd is given, unsplice by removing existing link. */ > + if (fd < 0) { > + if ((error = sblock(&so->so_rcv, SBL_WAIT)) != 0) > + return (error); > + solock(so); > + if (so->so_options & SO_ACCEPTCONN) { > + error = EOPNOTSUPP; > + goto out; > + } > + if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && > + (so->so_proto->pr_flags & PR_CONNREQUIRED)) { > + error = ENOTCONN; > + goto out; > + } > + > + if (so->so_sp && so->so_sp->ssp_socket) > + sounsplice(so, so->so_sp->ssp_socket, 0); > + out: > + sounlock(so); > + sbunlock(&so->so_rcv); > + return (error); > + } > + > if (sosplice_taskq == NULL) { > rw_enter_write(&sosplice_lock); > if (sosplice_taskq == NULL) { > @@ -1386,65 +1368,47 @@ sosplice(struct socket *so, int fd, off_ > membar_consumer(); > } > > - if (so->so_rcv.sb_flags & SB_MTXLOCK) { > - if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0) > - return (error); > - solock(so); > - } else { > - solock(so); > - if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0) { > - sounlock(so); > - return (error); > - } > - } > - > - if (so->so_options & SO_ACCEPTCONN) { > - error = EOPNOTSUPP; > - goto out; > - } > - if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && > - (so->so_proto->pr_flags & PR_CONNREQUIRED)) { > - error = ENOTCONN; > - goto out; > - } > - if (so->so_sp == NULL) > - so->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO); > - > - /* If no fd is given, unsplice by removing existing link. */ > - if (fd < 0) { > - if (so->so_sp->ssp_socket) > - sounsplice(so, so->so_sp->ssp_socket, 0); > - goto out; > - } > - > /* Find sosp, the drain socket where data will be spliced into. */ > if ((error = getsock(curproc, fd, &fp)) != 0) > - goto out; > + return (error); > sosp = fp->f_data; > + > if (sosp->so_proto->pr_usrreqs->pru_send != > so->so_proto->pr_usrreqs->pru_send) { > error = EPROTONOSUPPORT; > - goto out; > + goto frele; > } > - if (sosp->so_sp == NULL) > - sosp->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO); > > - if ((error = sblock(sosp, &sosp->so_snd, SBL_WAIT)) != 0) { > - goto out; > + if ((error = sblock(&so->so_rcv, SBL_WAIT)) != 0) > + goto frele; > + if ((error = sblock(&sosp->so_snd, SBL_WAIT)) != 0) { > + sbunlock(&so->so_rcv); > + goto frele; > } > + solock(so); > > - if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) { > - error = EBUSY; > + if ((so->so_options & SO_ACCEPTCONN) || > + (sosp->so_options & SO_ACCEPTCONN)) { > + error = EOPNOTSUPP; > goto release; > } > - if (sosp->so_options & SO_ACCEPTCONN) { > - error = EOPNOTSUPP; > + if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && > + (so->so_proto->pr_flags & PR_CONNREQUIRED)) { > + error = ENOTCONN; > goto release; > } > if ((sosp->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0) { > error = ENOTCONN; > goto release; > } > + if (so->so_sp == NULL) > + so->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO); > + if (sosp->so_sp == NULL) > + sosp->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO); > + if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) { > + error = EBUSY; > + goto release; > + } > > /* Splice so and sosp together. */ > mtx_enter(&so->so_rcv.sb_mtx); > @@ -1472,18 +1436,11 @@ sosplice(struct socket *so, int fd, off_ > } > > release: > - sbunlock(sosp, &sosp->so_snd); > - out: > - if (so->so_rcv.sb_flags & SB_MTXLOCK) { > - sounlock(so); > - sbunlock(so, &so->so_rcv); > - } else { > - sbunlock(so, &so->so_rcv); > - sounlock(so); > - } > - > - if (fp) > - FRELE(fp, curproc); > + sounlock(so); > + sbunlock(&sosp->so_snd); > + sbunlock(&so->so_rcv); > + frele: > + FRELE(fp, curproc); > > return (error); > } > Index: sys/kern/uipc_socket2.c > =================================================================== > RCS file: /cvs/src/sys/kern/uipc_socket2.c,v > diff -u -p -r1.154 uipc_socket2.c > --- sys/kern/uipc_socket2.c 7 May 2024 15:54:23 -0000 1.154 > +++ sys/kern/uipc_socket2.c 7 May 2024 19:27:19 -0000 > @@ -512,23 +512,19 @@ sbmtxassertlocked(struct socket *so, str > * Wait for data to arrive at/drain from a socket buffer. > */ > int > -sbwait_locked(struct socket *so, struct sockbuf *sb) > -{ > - int prio = (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH; > - > - MUTEX_ASSERT_LOCKED(&sb->sb_mtx); > - > - sb->sb_flags |= SB_WAIT; > - return msleep_nsec(&sb->sb_cc, &sb->sb_mtx, prio, "sbwait", > - sb->sb_timeo_nsecs); > -} > - > -int > sbwait(struct socket *so, struct sockbuf *sb) > { > uint64_t timeo_nsecs; > int prio = (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH; > > + if (sb->sb_flags & SB_MTXLOCK) { > + MUTEX_ASSERT_LOCKED(&sb->sb_mtx); > + > + sb->sb_flags |= SB_WAIT; > + return msleep_nsec(&sb->sb_cc, &sb->sb_mtx, prio, "sbwait", > + sb->sb_timeo_nsecs); > + } > + > soassertlocked(so); > > mtx_enter(&sb->sb_mtx); > @@ -540,81 +536,26 @@ sbwait(struct socket *so, struct sockbuf > } > > int > -sblock(struct socket *so, struct sockbuf *sb, int flags) > +sblock(struct sockbuf *sb, int flags) > { > - int error = 0, prio = PSOCK; > - > - if (sb->sb_flags & SB_MTXLOCK) { > - int rwflags = RW_WRITE; > - > - if (!(flags & SBL_NOINTR || sb->sb_flags & SB_NOINTR)) > - rwflags |= RW_INTR; > - if (!(flags & SBL_WAIT)) > - rwflags |= RW_NOSLEEP; > - > - error = rw_enter(&sb->sb_lock, rwflags); > - if (error == EBUSY) > - error = EWOULDBLOCK; > - return error; > - } > + int rwflags = RW_WRITE, error; > > - soassertlocked(so); > - > - mtx_enter(&sb->sb_mtx); > - if ((sb->sb_flags & SB_LOCK) == 0) { > - sb->sb_flags |= SB_LOCK; > - goto out; > - } > - if ((flags & SBL_WAIT) == 0) { > - error = EWOULDBLOCK; > - goto out; > - } > if (!(flags & SBL_NOINTR || sb->sb_flags & SB_NOINTR)) > - prio |= PCATCH; > - > - while (sb->sb_flags & SB_LOCK) { > - sb->sb_flags |= SB_WANT; > - mtx_leave(&sb->sb_mtx); > - error = sosleep_nsec(so, &sb->sb_flags, prio, "sblock", INFSLP); > - if (error) > - return (error); > - mtx_enter(&sb->sb_mtx); > - } > - sb->sb_flags |= SB_LOCK; > -out: > - mtx_leave(&sb->sb_mtx); > - > - return (error); > -} > - > -void > -sbunlock_locked(struct socket *so, struct sockbuf *sb) > -{ > - if (sb->sb_flags & SB_MTXLOCK) { > - rw_exit(&sb->sb_lock); > - return; > - } > + rwflags |= RW_INTR; > + if (!(flags & SBL_WAIT)) > + rwflags |= RW_NOSLEEP; > > - MUTEX_ASSERT_LOCKED(&sb->sb_mtx); > + error = rw_enter(&sb->sb_lock, rwflags); > + if (error == EBUSY) > + error = EWOULDBLOCK; > > - sb->sb_flags &= ~SB_LOCK; > - if (sb->sb_flags & SB_WANT) { > - sb->sb_flags &= ~SB_WANT; > - wakeup(&sb->sb_flags); > - } > + return error; > } > > void > -sbunlock(struct socket *so, struct sockbuf *sb) > +sbunlock(struct sockbuf *sb) > { > - if (sb->sb_flags & SB_MTXLOCK) { > - rw_exit(&sb->sb_lock); > - return; > - } > - > - mtx_enter(&sb->sb_mtx); > - sbunlock_locked(so, sb); > - mtx_leave(&sb->sb_mtx); > + rw_exit(&sb->sb_lock); > } > > /* > @@ -1128,7 +1069,7 @@ void > sbflush(struct socket *so, struct sockbuf *sb) > { > KASSERT(sb == &so->so_rcv || sb == &so->so_snd); > - KASSERT((sb->sb_flags & SB_LOCK) == 0); > + rw_assert_unlocked(&sb->sb_lock); > > while (sb->sb_mbcnt) > sbdrop(so, sb, (int)sb->sb_cc); > Index: sys/sys/socketvar.h > =================================================================== > RCS file: /cvs/src/sys/sys/socketvar.h,v > diff -u -p -r1.130 socketvar.h > --- sys/sys/socketvar.h 3 May 2024 17:43:09 -0000 1.130 > +++ sys/sys/socketvar.h 7 May 2024 19:27:20 -0000 > @@ -128,13 +128,11 @@ struct socket { > struct klist sb_klist; /* process selecting read/write */ > } so_rcv, so_snd; > #define SB_MAX (2*1024*1024) /* default for max chars in sockbuf */ > -#define SB_LOCK 0x0001 /* lock on data queue */ > -#define SB_WANT 0x0002 /* someone is waiting to lock */ > -#define SB_WAIT 0x0004 /* someone is waiting for data/space */ > -#define SB_ASYNC 0x0010 /* ASYNC I/O, need signals */ > -#define SB_SPLICE 0x0020 /* buffer is splice source or drain */ > -#define SB_NOINTR 0x0040 /* operations not interruptible */ > -#define SB_MTXLOCK 0x0080 /* sblock() doesn't need solock() */ > +#define SB_WAIT 0x0001 /* someone is waiting for data/space */ > +#define SB_ASYNC 0x0002 /* ASYNC I/O, need signals */ > +#define SB_SPLICE 0x0004 /* buffer is splice source or drain */ > +#define SB_NOINTR 0x0008 /* operations not interruptible */ > +#define SB_MTXLOCK 0x0010 /* sblock() doesn't need solock() */ > > void (*so_upcall)(struct socket *so, caddr_t arg, int waitf); > caddr_t so_upcallarg; /* Arg for above */ > @@ -315,11 +313,10 @@ sbfree(struct socket *so, struct sockbuf > * sleep is interruptible. Returns error without lock if > * sleep is interrupted. > */ > -int sblock(struct socket *, struct sockbuf *, int); > +int sblock(struct sockbuf *, int); > > /* release lock on sockbuf sb */ > -void sbunlock(struct socket *, struct sockbuf *); > -void sbunlock_locked(struct socket *, struct sockbuf *); > +void sbunlock(struct sockbuf *); > > #define SB_EMPTY_FIXUP(sb) do { \ > if ((sb)->sb_mb == NULL) { \ > @@ -367,7 +364,6 @@ int sbcheckreserve(u_long, u_long); > int sbchecklowmem(void); > int sbreserve(struct socket *, struct sockbuf *, u_long); > int sbwait(struct socket *, struct sockbuf *); > -int sbwait_locked(struct socket *, struct sockbuf *); > void soinit(void); > void soabort(struct socket *); > int soaccept(struct socket *, struct mbuf *);