From: Alexander Bluhm Subject: Re: Don't take solock() in soreceive() for SOCK_RAW inet sockets To: Vitaliy Makkoveev Cc: tech@openbsd.org Date: Wed, 3 Apr 2024 21:57:09 +0200 On Sun, Mar 31, 2024 at 11:53:02PM +0300, Vitaliy Makkoveev wrote: > Reduce netlock pressure for inet sockets. > > These sockets are not connection oriented, they don't call pru_rcvd(), > they can't be spliced, they don't set `so_error'. Nothing to protect > with solock() in soreceive(), filt_soread() and filt_soexcept() paths. > > `so_rcv' buffer protected by `sb_mtx' mutex(9), but since it released in > soreceive() path, sblock() required to serialize concurrent soreceive() > and sorflush() threads. Current sblock() is some kind of rwlock(9) > implementation, so introduce `sb_lock' and it directly for that purpose. > > The sorflush() and callers were refactored to avoid solock() for raw > inet sockets. This done to avoid packet processing stop. > > The new SB_STANDALONE flag is partially redundant WITH SB_OWNLOCK, but > they will be merged in one flag later. Passed regress with witness. But it found splassert: splassert: soassertlocked: want 1 have 0 Starting stack trace... soassertlocked(1) at soassertlocked+0xc7 splassert: soassertlocked: want 1 have 0 Parallel traceback, suppressed... soassertlocked(d782f188) at soassertlocked+0xc7 sbreserve(d782f188,d782f270,10000) at sbreserve+0x19 sosetopt(d782f188,ffff,1001,dabe6400) at sosetopt+0x295 sys_setsockopt(d7709cb8,f5fbf850,f5fbf848) at sys_setsockopt+0x126 syscall(f5fbf890) at syscall+0x41d Xsyscall_untramp() at Xsyscall_untramp+0xa9 I am not a big fan of another SB_ flag. It is hard enough to understand the others. I have not done a propper review yet. bluhm > Index: sys/kern/uipc_socket.c > =================================================================== > RCS file: /cvs/src/sys/kern/uipc_socket.c,v > retrieving revision 1.325 > diff -u -p -r1.325 uipc_socket.c > --- sys/kern/uipc_socket.c 31 Mar 2024 14:01:28 -0000 1.325 > +++ sys/kern/uipc_socket.c 31 Mar 2024 20:30:10 -0000 > @@ -142,6 +142,8 @@ soalloc(const struct protosw *prp, int w > return (NULL); > rw_init_flags(&so->so_lock, dp->dom_name, RWL_DUPOK); > refcnt_init(&so->so_refcnt); > + rw_init(&so->so_rcv.sb_lock, "sbufrcv"); > + rw_init(&so->so_snd.sb_lock, "sbufsnd"); > mtx_init(&so->so_rcv.sb_mtx, IPL_MPFLOOR); > mtx_init(&so->so_snd.sb_mtx, IPL_MPFLOOR); > klist_init_mutex(&so->so_rcv.sb_klist, &so->so_rcv.sb_mtx); > @@ -155,10 +157,10 @@ soalloc(const struct protosw *prp, int w > case AF_INET6: > switch (prp->pr_type) { > case SOCK_DGRAM: > - so->so_rcv.sb_flags |= SB_OWNLOCK; > - /* FALLTHROUGH */ > + so->so_rcv.sb_flags |= SB_MTXLOCK | SB_OWNLOCK; > + break; > case SOCK_RAW: > - so->so_rcv.sb_flags |= SB_MTXLOCK; > + so->so_rcv.sb_flags |= SB_MTXLOCK | SB_STANDALONE; > break; > } > break; > @@ -345,9 +347,22 @@ sofree(struct socket *so, int keep_lock) > } > #endif /* SOCKET_SPLICE */ > sbrelease(so, &so->so_snd); > - sorflush(so); > - if (!keep_lock) > + > + /* > + * Regardless on '_locked' postfix, must release solock() before > + * call sorflush_locked() for SB_STANDALONE marked socket. Can't > + * release solock() and call sorflush() because solock() release > + * is unwanted for tcp(4) socket. > + */ > + > + if (so->so_rcv.sb_flags & SB_STANDALONE) > sounlock(so); > + > + sorflush_locked(so); > + > + if (!((so->so_rcv.sb_flags & SB_STANDALONE) || keep_lock)) > + sounlock(so); > + > #ifdef SOCKET_SPLICE > if (so->so_sp) { > /* Reuse splice idle, sounsplice() has been called before. */ > @@ -815,6 +830,7 @@ soreceive(struct socket *so, struct mbuf > const struct protosw *pr = so->so_proto; > struct mbuf *nextrecord; > size_t resid, orig_resid = uio->uio_resid; > + int dosolock = ((so->so_rcv.sb_flags & SB_STANDALONE) == 0); > > mp = mp0; > if (paddr) > @@ -844,12 +860,11 @@ bad: > if (mp) > *mp = NULL; > > - solock_shared(so); > + if (dosolock) > + solock_shared(so); > restart: > - if ((error = sblock(so, &so->so_rcv, SBLOCKWAIT(flags))) != 0) { > - sounlock_shared(so); > - return (error); > - } > + if ((error = sblock(so, &so->so_rcv, SBLOCKWAIT(flags))) != 0) > + goto out; > sb_mtx_lock(&so->so_rcv); > > m = so->so_rcv.sb_mb; > @@ -914,14 +929,16 @@ restart: > SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1"); > SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1"); > > - if (so->so_rcv.sb_flags & SB_OWNLOCK) { > + if (so->so_rcv.sb_flags & (SB_OWNLOCK | SB_STANDALONE)) { > sbunlock_locked(so, &so->so_rcv); > - sounlock_shared(so); > + if (dosolock) > + sounlock_shared(so); > error = sbwait_locked(so, &so->so_rcv); > sb_mtx_unlock(&so->so_rcv); > if (error) > return (error); > - solock_shared(so); > + if (dosolock) > + solock_shared(so); > } else { > sb_mtx_unlock(&so->so_rcv); > sbunlock(so, &so->so_rcv); > @@ -998,11 +1015,13 @@ dontblock: > if (controlp) { > if (pr->pr_domain->dom_externalize) { > sb_mtx_unlock(&so->so_rcv); > - sounlock_shared(so); > + if (dosolock) > + sounlock_shared(so); > error = > (*pr->pr_domain->dom_externalize) > (cm, controllen, flags); > - solock_shared(so); > + if (dosolock) > + solock_shared(so); > sb_mtx_lock(&so->so_rcv); > } > *controlp = cm; > @@ -1081,9 +1100,11 @@ dontblock: > SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove"); > resid = uio->uio_resid; > sb_mtx_unlock(&so->so_rcv); > - sounlock_shared(so); > + if (dosolock) > + sounlock_shared(so); > uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio); > - solock_shared(so); > + if (dosolock) > + solock_shared(so); > sb_mtx_lock(&so->so_rcv); > if (uio_error) > uio->uio_resid = resid - len; > @@ -1166,14 +1187,21 @@ dontblock: > break; > SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2"); > SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2"); > - sb_mtx_unlock(&so->so_rcv); > - error = sbwait(so, &so->so_rcv); > - if (error) { > - sbunlock(so, &so->so_rcv); > - sounlock_shared(so); > - return (0); > + if (dosolock) { > + sb_mtx_unlock(&so->so_rcv); > + error = sbwait(so, &so->so_rcv); > + if (error) { > + sbunlock(so, &so->so_rcv); > + sounlock_shared(so); > + return (0); > + } > + sb_mtx_lock(&so->so_rcv); > + } else { > + if (sbwait_locked(so, &so->so_rcv)) { > + error = 0; > + goto release; > + } > } > - sb_mtx_lock(&so->so_rcv); > if ((m = so->so_rcv.sb_mb) != NULL) > nextrecord = m->m_nextpkt; > } > @@ -1222,7 +1250,9 @@ dontblock: > release: > sb_mtx_unlock(&so->so_rcv); > sbunlock(so, &so->so_rcv); > - sounlock_shared(so); > +out: > + if (dosolock) > + sounlock_shared(so); > return (error); > } > > @@ -1231,7 +1261,6 @@ soshutdown(struct socket *so, int how) > { > int error = 0; > > - solock(so); > switch (how) { > case SHUT_RD: > sorflush(so); > @@ -1240,25 +1269,29 @@ soshutdown(struct socket *so, int how) > sorflush(so); > /* FALLTHROUGH */ > case SHUT_WR: > + solock(so); > error = pru_shutdown(so); > + sounlock(so); > break; > default: > error = EINVAL; > break; > } > - sounlock(so); > > return (error); > } > > void > -sorflush(struct socket *so) > +sorflush_locked(struct socket *so) > { > struct sockbuf *sb = &so->so_rcv; > struct mbuf *m; > const struct protosw *pr = so->so_proto; > int error; > > + if ((sb->sb_flags & SB_STANDALONE) == 0) > + soassertlocked(so); > + > error = sblock(so, sb, SBL_WAIT | SBL_NOINTR); > /* with SBL_WAIT and SLB_NOINTR sblock() must not fail */ > KASSERT(error == 0); > @@ -1275,6 +1308,16 @@ sorflush(struct socket *so) > m_purge(m); > } > > +void > +sorflush(struct socket *so) > +{ > + if ((so->so_rcv.sb_flags & SB_STANDALONE) == 0) > + solock(so); > + sorflush_locked(so); > + if ((so->so_rcv.sb_flags & SB_STANDALONE) == 0) > + sounlock(so); > +} > + > #ifdef SOCKET_SPLICE > > #define so_splicelen so_sp->ssp_len > @@ -1913,7 +1956,8 @@ sosetopt(struct socket *so, int level, i > if ((long)cnt <= 0) > cnt = 1; > > - solock(so); > + if (((so->so_rcv.sb_flags & SB_STANDALONE) == 0)) > + solock(so); > mtx_enter(&sb->sb_mtx); > > switch (optname) { > @@ -1939,7 +1983,8 @@ sosetopt(struct socket *so, int level, i > } > > mtx_leave(&sb->sb_mtx); > - sounlock(so); > + if (((so->so_rcv.sb_flags & SB_STANDALONE) == 0)) > + sounlock(so); > > break; > } > Index: sys/kern/uipc_socket2.c > =================================================================== > RCS file: /cvs/src/sys/kern/uipc_socket2.c,v > retrieving revision 1.147 > diff -u -p -r1.147 uipc_socket2.c > --- sys/kern/uipc_socket2.c 31 Mar 2024 13:50:00 -0000 1.147 > +++ sys/kern/uipc_socket2.c 31 Mar 2024 20:30:10 -0000 > @@ -350,7 +350,9 @@ socantsendmore(struct socket *so) > void > socantrcvmore(struct socket *so) > { > - soassertlocked(so); > + if ((so->so_rcv.sb_flags & SB_STANDALONE) == 0) > + soassertlocked(so); > + > mtx_enter(&so->so_rcv.sb_mtx); > so->so_rcv.sb_state |= SS_CANTRCVMORE; > mtx_leave(&so->so_rcv.sb_mtx); > @@ -557,6 +559,17 @@ sblock(struct socket *so, struct sockbuf > { > int error = 0, prio = PSOCK; > > + if (sb->sb_flags & SB_STANDALONE) { > + int rwflags = RW_WRITE; > + > + if (!(flags & SBL_NOINTR || sb->sb_flags & SB_NOINTR)) > + rwflags |= RW_INTR; > + if (!(flags & SBL_WAIT)) > + rwflags |= RW_NOSLEEP; > + > + return rw_enter(&sb->sb_lock, rwflags); > + } > + > soassertlocked(so); > > mtx_enter(&sb->sb_mtx); > @@ -589,6 +602,11 @@ out: > void > sbunlock_locked(struct socket *so, struct sockbuf *sb) > { > + if (sb->sb_flags & SB_STANDALONE) { > + rw_exit(&sb->sb_lock); > + return; > + } > + > MUTEX_ASSERT_LOCKED(&sb->sb_mtx); > > sb->sb_flags &= ~SB_LOCK; > @@ -601,6 +619,11 @@ sbunlock_locked(struct socket *so, struc > void > sbunlock(struct socket *so, struct sockbuf *sb) > { > + if (sb->sb_flags & SB_STANDALONE) { > + rw_exit(&sb->sb_lock); > + return; > + } > + > mtx_enter(&sb->sb_mtx); > sbunlock_locked(so, sb); > mtx_leave(&sb->sb_mtx); > Index: sys/kern/uipc_usrreq.c > =================================================================== > RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v > retrieving revision 1.203 > diff -u -p -r1.203 uipc_usrreq.c > --- sys/kern/uipc_usrreq.c 26 Mar 2024 09:46:47 -0000 1.203 > +++ sys/kern/uipc_usrreq.c 31 Mar 2024 20:30:10 -0000 > @@ -1450,9 +1450,7 @@ unp_gc(void *arg __unused) > * thread. > */ > so = unp->unp_socket; > - solock(so); > sorflush(so); > - sounlock(so); > } > } > } > Index: sys/sys/socketvar.h > =================================================================== > RCS file: /cvs/src/sys/sys/socketvar.h,v > retrieving revision 1.127 > diff -u -p -r1.127 socketvar.h > --- sys/sys/socketvar.h 27 Mar 2024 22:47:53 -0000 1.127 > +++ sys/sys/socketvar.h 31 Mar 2024 20:30:10 -0000 > @@ -106,7 +106,8 @@ struct socket { > * Variables for socket buffering. > */ > struct sockbuf { > - struct mutex sb_mtx; > + struct rwlock sb_lock; > + struct mutex sb_mtx; > /* The following fields are all zeroed on flush. */ > #define sb_startzero sb_cc > u_long sb_cc; /* actual chars in buffer */ > @@ -136,6 +137,7 @@ struct socket { > #define SB_NOINTR 0x0040 /* operations not interruptible */ > #define SB_MTXLOCK 0x0080 /* use sb_mtx for sockbuf protection */ > #define SB_OWNLOCK 0x0100 /* sb_mtx used standalone */ > +#define SB_STANDALONE 0x0200 /* standalone sblock() */ > > void (*so_upcall)(struct socket *so, caddr_t arg, int waitf); > caddr_t so_upcallarg; /* Arg for above */ > @@ -401,6 +403,7 @@ int sosend(struct socket *, struct mbuf > int sosetopt(struct socket *, int, int, struct mbuf *); > int soshutdown(struct socket *, int); > void sorflush(struct socket *); > +void sorflush_locked(struct socket *); > void sowakeup(struct socket *, struct sockbuf *); > void sorwakeup(struct socket *); > void sowwakeup(struct socket *);