From: Vitaliy Makkoveev Subject: Re: Turn sblock() to `sb_lock' rwlock(9) wrapper for all sockets To: tech@openbsd.org Date: Tue, 7 May 2024 23:23:17 +0300 On Sat, May 04, 2024 at 02:57:00AM +0300, Vitaliy Makkoveev wrote: > sblock() is fine-grained lock which serializes socket send and receive > routines on `so_rcv' or `so_snd' buffers. There is no big problem to > wait netlock while holding sblock(). > > So unify sblock() behaviour to all sockets. Now it should be always > taken before solock() in all involved paths as sosend(), soreceive(), > sorflush() and sosplice() as already done for "unlocked" socket buffers. > > This unification removes a lot of temporary "sb_flags & SB_MTXLOCK" code > from sockets layer. This unification makes straight "solock()" and > "sblock()" lock order, no more solock() -> sblock() -> sounlock() -> > solock() -> sbunlock() -> sounlock() chains in sosend() and soreceive() > paths. This unification brings witness(4) support for sblock(), include > NFS involved sockets, which is useful. > > Since the witness(4) support was introduced to sblock() with this diff, > some new witness reports appeared. That's normal, it's the first time > they were exposed. In addition to [1] "lock order reversal in soreceive > and NFS" thread, I could say that `so_snd' also produces witness > reports during regress/sys/ffs/nfs run. > Updated against last tree changes. In addition, I removed unused `so' argument from sblock()/sbunlock(). Index: sys/kern/uipc_socket.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_socket.c,v diff -u -p -r1.333 uipc_socket.c --- sys/kern/uipc_socket.c 3 May 2024 17:43:09 -0000 1.333 +++ sys/kern/uipc_socket.c 7 May 2024 19:27:18 -0000 @@ -66,7 +66,6 @@ void soreaper(void *); void soput(void *); int somove(struct socket *, int); void sorflush(struct socket *); -void sorflush_locked(struct socket *); void filt_sordetach(struct knote *kn); int filt_soread(struct knote *kn, long hint); @@ -606,11 +605,11 @@ sosend(struct socket *so, struct mbuf *a #define snderr(errno) { error = errno; goto release; } - if (dosolock) - solock_shared(so); restart: - if ((error = sblock(so, &so->so_snd, SBLOCKWAIT(flags))) != 0) + if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0) goto out; + if (dosolock) + solock_shared(so); sb_mtx_lock(&so->so_snd); so->so_snd.sb_state |= SS_ISSENDING; do { @@ -643,15 +642,12 @@ restart: (atomic || space < so->so_snd.sb_lowat))) { if (flags & MSG_DONTWAIT) snderr(EWOULDBLOCK); - sbunlock(so, &so->so_snd); - - if (so->so_snd.sb_flags & SB_MTXLOCK) - error = sbwait_locked(so, &so->so_snd); - else - error = sbwait(so, &so->so_snd); - + sbunlock(&so->so_snd); + error = sbwait(so, &so->so_snd); so->so_snd.sb_state &= ~SS_ISSENDING; sb_mtx_unlock(&so->so_snd); + if (dosolock) + sounlock_shared(so); if (error) goto out; goto restart; @@ -705,10 +701,10 @@ restart: release: so->so_snd.sb_state &= ~SS_ISSENDING; sb_mtx_unlock(&so->so_snd); - sbunlock(so, &so->so_snd); -out: if (dosolock) sounlock_shared(so); + sbunlock(&so->so_snd); +out: m_freem(top); m_freem(control); return (error); @@ -875,11 +871,11 @@ bad: if (mp) *mp = NULL; +restart: + if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0) + return (error); if (dosolock) solock_shared(so); -restart: - if ((error = sblock(so, &so->so_rcv, SBLOCKWAIT(flags))) != 0) - goto out; sb_mtx_lock(&so->so_rcv); m = so->so_rcv.sb_mb; @@ -944,25 +940,13 @@ restart: SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1"); SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1"); - if (so->so_rcv.sb_flags & SB_MTXLOCK) { - sbunlock_locked(so, &so->so_rcv); - if (dosolock) - sounlock_shared(so); - error = sbwait_locked(so, &so->so_rcv); - sb_mtx_unlock(&so->so_rcv); - if (error) - return (error); - if (dosolock) - solock_shared(so); - } else { - sb_mtx_unlock(&so->so_rcv); - sbunlock(so, &so->so_rcv); - error = sbwait(so, &so->so_rcv); - if (error) { - sounlock_shared(so); - return (error); - } - } + sbunlock(&so->so_rcv); + error = sbwait(so, &so->so_rcv); + sb_mtx_unlock(&so->so_rcv); + if (dosolock) + sounlock_shared(so); + if (error) + return (error); goto restart; } dontblock: @@ -1202,21 +1186,12 @@ dontblock: break; SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2"); SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2"); - if (dosolock) { + if (sbwait(so, &so->so_rcv)) { sb_mtx_unlock(&so->so_rcv); - error = sbwait(so, &so->so_rcv); - if (error) { - sbunlock(so, &so->so_rcv); + if (dosolock) sounlock_shared(so); - return (0); - } - sb_mtx_lock(&so->so_rcv); - } else { - if (sbwait_locked(so, &so->so_rcv)) { - sb_mtx_unlock(&so->so_rcv); - sbunlock(so, &so->so_rcv); - return (0); - } + sbunlock(&so->so_rcv); + return (0); } if ((m = so->so_rcv.sb_mb) != NULL) nextrecord = m->m_nextpkt; @@ -1258,7 +1233,7 @@ dontblock: (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SS_CANTRCVMORE) == 0) { sb_mtx_unlock(&so->so_rcv); - sbunlock(so, &so->so_rcv); + sbunlock(&so->so_rcv); goto restart; } @@ -1269,10 +1244,9 @@ dontblock: *flagsp |= flags; release: sb_mtx_unlock(&so->so_rcv); - sbunlock(so, &so->so_rcv); -out: if (dosolock) sounlock_shared(so); + sbunlock(&so->so_rcv); return (error); } @@ -1302,48 +1276,33 @@ soshutdown(struct socket *so, int how) } void -sorflush_locked(struct socket *so) +sorflush(struct socket *so) { struct sockbuf *sb = &so->so_rcv; struct mbuf *m; const struct protosw *pr = so->so_proto; int error; - if ((sb->sb_flags & SB_MTXLOCK) == 0) - soassertlocked(so); - - error = sblock(so, sb, SBL_WAIT | SBL_NOINTR); + error = sblock(sb, SBL_WAIT | SBL_NOINTR); /* with SBL_WAIT and SLB_NOINTR sblock() must not fail */ KASSERT(error == 0); - if (sb->sb_flags & SB_MTXLOCK) - solock(so); + solock_shared(so); socantrcvmore(so); - if (sb->sb_flags & SB_MTXLOCK) - sounlock(so); - mtx_enter(&sb->sb_mtx); m = sb->sb_mb; memset(&sb->sb_startzero, 0, (caddr_t)&sb->sb_endzero - (caddr_t)&sb->sb_startzero); sb->sb_timeo_nsecs = INFSLP; mtx_leave(&sb->sb_mtx); - sbunlock(so, sb); + sounlock_shared(so); + sbunlock(sb); + if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose) (*pr->pr_domain->dom_dispose)(m); m_purge(m); } -void -sorflush(struct socket *so) -{ - if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0) - solock_shared(so); - sorflush_locked(so); - if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0) - sounlock_shared(so); -} - #ifdef SOCKET_SPLICE #define so_splicelen so_sp->ssp_len @@ -1355,7 +1314,7 @@ sorflush(struct socket *so) int sosplice(struct socket *so, int fd, off_t max, struct timeval *tv) { - struct file *fp = NULL; + struct file *fp; struct socket *sosp; struct taskq *tq; int error = 0; @@ -1367,6 +1326,29 @@ sosplice(struct socket *so, int fd, off_ if (tv && (tv->tv_sec < 0 || !timerisvalid(tv))) return (EINVAL); + /* If no fd is given, unsplice by removing existing link. */ + if (fd < 0) { + if ((error = sblock(&so->so_rcv, SBL_WAIT)) != 0) + return (error); + solock(so); + if (so->so_options & SO_ACCEPTCONN) { + error = EOPNOTSUPP; + goto out; + } + if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && + (so->so_proto->pr_flags & PR_CONNREQUIRED)) { + error = ENOTCONN; + goto out; + } + + if (so->so_sp && so->so_sp->ssp_socket) + sounsplice(so, so->so_sp->ssp_socket, 0); + out: + sounlock(so); + sbunlock(&so->so_rcv); + return (error); + } + if (sosplice_taskq == NULL) { rw_enter_write(&sosplice_lock); if (sosplice_taskq == NULL) { @@ -1386,65 +1368,47 @@ sosplice(struct socket *so, int fd, off_ membar_consumer(); } - if (so->so_rcv.sb_flags & SB_MTXLOCK) { - if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0) - return (error); - solock(so); - } else { - solock(so); - if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0) { - sounlock(so); - return (error); - } - } - - if (so->so_options & SO_ACCEPTCONN) { - error = EOPNOTSUPP; - goto out; - } - if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && - (so->so_proto->pr_flags & PR_CONNREQUIRED)) { - error = ENOTCONN; - goto out; - } - if (so->so_sp == NULL) - so->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO); - - /* If no fd is given, unsplice by removing existing link. */ - if (fd < 0) { - if (so->so_sp->ssp_socket) - sounsplice(so, so->so_sp->ssp_socket, 0); - goto out; - } - /* Find sosp, the drain socket where data will be spliced into. */ if ((error = getsock(curproc, fd, &fp)) != 0) - goto out; + return (error); sosp = fp->f_data; + if (sosp->so_proto->pr_usrreqs->pru_send != so->so_proto->pr_usrreqs->pru_send) { error = EPROTONOSUPPORT; - goto out; + goto frele; } - if (sosp->so_sp == NULL) - sosp->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO); - if ((error = sblock(sosp, &sosp->so_snd, SBL_WAIT)) != 0) { - goto out; + if ((error = sblock(&so->so_rcv, SBL_WAIT)) != 0) + goto frele; + if ((error = sblock(&sosp->so_snd, SBL_WAIT)) != 0) { + sbunlock(&so->so_rcv); + goto frele; } + solock(so); - if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) { - error = EBUSY; + if ((so->so_options & SO_ACCEPTCONN) || + (sosp->so_options & SO_ACCEPTCONN)) { + error = EOPNOTSUPP; goto release; } - if (sosp->so_options & SO_ACCEPTCONN) { - error = EOPNOTSUPP; + if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && + (so->so_proto->pr_flags & PR_CONNREQUIRED)) { + error = ENOTCONN; goto release; } if ((sosp->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0) { error = ENOTCONN; goto release; } + if (so->so_sp == NULL) + so->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO); + if (sosp->so_sp == NULL) + sosp->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO); + if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) { + error = EBUSY; + goto release; + } /* Splice so and sosp together. */ mtx_enter(&so->so_rcv.sb_mtx); @@ -1472,18 +1436,11 @@ sosplice(struct socket *so, int fd, off_ } release: - sbunlock(sosp, &sosp->so_snd); - out: - if (so->so_rcv.sb_flags & SB_MTXLOCK) { - sounlock(so); - sbunlock(so, &so->so_rcv); - } else { - sbunlock(so, &so->so_rcv); - sounlock(so); - } - - if (fp) - FRELE(fp, curproc); + sounlock(so); + sbunlock(&sosp->so_snd); + sbunlock(&so->so_rcv); + frele: + FRELE(fp, curproc); return (error); } Index: sys/kern/uipc_socket2.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_socket2.c,v diff -u -p -r1.154 uipc_socket2.c --- sys/kern/uipc_socket2.c 7 May 2024 15:54:23 -0000 1.154 +++ sys/kern/uipc_socket2.c 7 May 2024 19:27:19 -0000 @@ -512,23 +512,19 @@ sbmtxassertlocked(struct socket *so, str * Wait for data to arrive at/drain from a socket buffer. */ int -sbwait_locked(struct socket *so, struct sockbuf *sb) -{ - int prio = (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH; - - MUTEX_ASSERT_LOCKED(&sb->sb_mtx); - - sb->sb_flags |= SB_WAIT; - return msleep_nsec(&sb->sb_cc, &sb->sb_mtx, prio, "sbwait", - sb->sb_timeo_nsecs); -} - -int sbwait(struct socket *so, struct sockbuf *sb) { uint64_t timeo_nsecs; int prio = (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH; + if (sb->sb_flags & SB_MTXLOCK) { + MUTEX_ASSERT_LOCKED(&sb->sb_mtx); + + sb->sb_flags |= SB_WAIT; + return msleep_nsec(&sb->sb_cc, &sb->sb_mtx, prio, "sbwait", + sb->sb_timeo_nsecs); + } + soassertlocked(so); mtx_enter(&sb->sb_mtx); @@ -540,81 +536,26 @@ sbwait(struct socket *so, struct sockbuf } int -sblock(struct socket *so, struct sockbuf *sb, int flags) +sblock(struct sockbuf *sb, int flags) { - int error = 0, prio = PSOCK; - - if (sb->sb_flags & SB_MTXLOCK) { - int rwflags = RW_WRITE; - - if (!(flags & SBL_NOINTR || sb->sb_flags & SB_NOINTR)) - rwflags |= RW_INTR; - if (!(flags & SBL_WAIT)) - rwflags |= RW_NOSLEEP; - - error = rw_enter(&sb->sb_lock, rwflags); - if (error == EBUSY) - error = EWOULDBLOCK; - return error; - } + int rwflags = RW_WRITE, error; - soassertlocked(so); - - mtx_enter(&sb->sb_mtx); - if ((sb->sb_flags & SB_LOCK) == 0) { - sb->sb_flags |= SB_LOCK; - goto out; - } - if ((flags & SBL_WAIT) == 0) { - error = EWOULDBLOCK; - goto out; - } if (!(flags & SBL_NOINTR || sb->sb_flags & SB_NOINTR)) - prio |= PCATCH; - - while (sb->sb_flags & SB_LOCK) { - sb->sb_flags |= SB_WANT; - mtx_leave(&sb->sb_mtx); - error = sosleep_nsec(so, &sb->sb_flags, prio, "sblock", INFSLP); - if (error) - return (error); - mtx_enter(&sb->sb_mtx); - } - sb->sb_flags |= SB_LOCK; -out: - mtx_leave(&sb->sb_mtx); - - return (error); -} - -void -sbunlock_locked(struct socket *so, struct sockbuf *sb) -{ - if (sb->sb_flags & SB_MTXLOCK) { - rw_exit(&sb->sb_lock); - return; - } + rwflags |= RW_INTR; + if (!(flags & SBL_WAIT)) + rwflags |= RW_NOSLEEP; - MUTEX_ASSERT_LOCKED(&sb->sb_mtx); + error = rw_enter(&sb->sb_lock, rwflags); + if (error == EBUSY) + error = EWOULDBLOCK; - sb->sb_flags &= ~SB_LOCK; - if (sb->sb_flags & SB_WANT) { - sb->sb_flags &= ~SB_WANT; - wakeup(&sb->sb_flags); - } + return error; } void -sbunlock(struct socket *so, struct sockbuf *sb) +sbunlock(struct sockbuf *sb) { - if (sb->sb_flags & SB_MTXLOCK) { - rw_exit(&sb->sb_lock); - return; - } - - mtx_enter(&sb->sb_mtx); - sbunlock_locked(so, sb); - mtx_leave(&sb->sb_mtx); + rw_exit(&sb->sb_lock); } /* @@ -1128,7 +1069,7 @@ void sbflush(struct socket *so, struct sockbuf *sb) { KASSERT(sb == &so->so_rcv || sb == &so->so_snd); - KASSERT((sb->sb_flags & SB_LOCK) == 0); + rw_assert_unlocked(&sb->sb_lock); while (sb->sb_mbcnt) sbdrop(so, sb, (int)sb->sb_cc); Index: sys/sys/socketvar.h =================================================================== RCS file: /cvs/src/sys/sys/socketvar.h,v diff -u -p -r1.130 socketvar.h --- sys/sys/socketvar.h 3 May 2024 17:43:09 -0000 1.130 +++ sys/sys/socketvar.h 7 May 2024 19:27:20 -0000 @@ -128,13 +128,11 @@ struct socket { struct klist sb_klist; /* process selecting read/write */ } so_rcv, so_snd; #define SB_MAX (2*1024*1024) /* default for max chars in sockbuf */ -#define SB_LOCK 0x0001 /* lock on data queue */ -#define SB_WANT 0x0002 /* someone is waiting to lock */ -#define SB_WAIT 0x0004 /* someone is waiting for data/space */ -#define SB_ASYNC 0x0010 /* ASYNC I/O, need signals */ -#define SB_SPLICE 0x0020 /* buffer is splice source or drain */ -#define SB_NOINTR 0x0040 /* operations not interruptible */ -#define SB_MTXLOCK 0x0080 /* sblock() doesn't need solock() */ +#define SB_WAIT 0x0001 /* someone is waiting for data/space */ +#define SB_ASYNC 0x0002 /* ASYNC I/O, need signals */ +#define SB_SPLICE 0x0004 /* buffer is splice source or drain */ +#define SB_NOINTR 0x0008 /* operations not interruptible */ +#define SB_MTXLOCK 0x0010 /* sblock() doesn't need solock() */ void (*so_upcall)(struct socket *so, caddr_t arg, int waitf); caddr_t so_upcallarg; /* Arg for above */ @@ -315,11 +313,10 @@ sbfree(struct socket *so, struct sockbuf * sleep is interruptible. Returns error without lock if * sleep is interrupted. */ -int sblock(struct socket *, struct sockbuf *, int); +int sblock(struct sockbuf *, int); /* release lock on sockbuf sb */ -void sbunlock(struct socket *, struct sockbuf *); -void sbunlock_locked(struct socket *, struct sockbuf *); +void sbunlock(struct sockbuf *); #define SB_EMPTY_FIXUP(sb) do { \ if ((sb)->sb_mb == NULL) { \ @@ -367,7 +364,6 @@ int sbcheckreserve(u_long, u_long); int sbchecklowmem(void); int sbreserve(struct socket *, struct sockbuf *, u_long); int sbwait(struct socket *, struct sockbuf *); -int sbwait_locked(struct socket *, struct sockbuf *); void soinit(void); void soabort(struct socket *); int soaccept(struct socket *, struct mbuf *);