Download raw body.
Don't take solock() in soreceive() for SOCK_RAW inet sockets
Don't take solock() in soreceive() for SOCK_RAW inet sockets
On Sun, Mar 31, 2024 at 11:53:02PM +0300, Vitaliy Makkoveev wrote:
> Reduce netlock pressure for inet sockets.
>
> These sockets are not connection oriented, they don't call pru_rcvd(),
> they can't be spliced, they don't set `so_error'. Nothing to protect
> with solock() in soreceive(), filt_soread() and filt_soexcept() paths.
>
> `so_rcv' buffer protected by `sb_mtx' mutex(9), but since it released in
> soreceive() path, sblock() required to serialize concurrent soreceive()
> and sorflush() threads. Current sblock() is some kind of rwlock(9)
> implementation, so introduce `sb_lock' and it directly for that purpose.
>
> The sorflush() and callers were refactored to avoid solock() for raw
> inet sockets. This done to avoid packet processing stop.
>
> The new SB_STANDALONE flag is partially redundant WITH SB_OWNLOCK, but
> they will be merged in one flag later.
Passed regress with witness. But it found splassert:
splassert: soassertlocked: want 1 have 0
Starting stack trace...
soassertlocked(1) at soassertlocked+0xc7
splassert: soassertlocked: want 1 have 0
Parallel traceback, suppressed...
soassertlocked(d782f188) at soassertlocked+0xc7
sbreserve(d782f188,d782f270,10000) at sbreserve+0x19
sosetopt(d782f188,ffff,1001,dabe6400) at sosetopt+0x295
sys_setsockopt(d7709cb8,f5fbf850,f5fbf848) at sys_setsockopt+0x126
syscall(f5fbf890) at syscall+0x41d
Xsyscall_untramp() at Xsyscall_untramp+0xa9
I am not a big fan of another SB_ flag. It is hard enough to
understand the others.
I have not done a propper review yet.
bluhm
> Index: sys/kern/uipc_socket.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/uipc_socket.c,v
> retrieving revision 1.325
> diff -u -p -r1.325 uipc_socket.c
> --- sys/kern/uipc_socket.c 31 Mar 2024 14:01:28 -0000 1.325
> +++ sys/kern/uipc_socket.c 31 Mar 2024 20:30:10 -0000
> @@ -142,6 +142,8 @@ soalloc(const struct protosw *prp, int w
> return (NULL);
> rw_init_flags(&so->so_lock, dp->dom_name, RWL_DUPOK);
> refcnt_init(&so->so_refcnt);
> + rw_init(&so->so_rcv.sb_lock, "sbufrcv");
> + rw_init(&so->so_snd.sb_lock, "sbufsnd");
> mtx_init(&so->so_rcv.sb_mtx, IPL_MPFLOOR);
> mtx_init(&so->so_snd.sb_mtx, IPL_MPFLOOR);
> klist_init_mutex(&so->so_rcv.sb_klist, &so->so_rcv.sb_mtx);
> @@ -155,10 +157,10 @@ soalloc(const struct protosw *prp, int w
> case AF_INET6:
> switch (prp->pr_type) {
> case SOCK_DGRAM:
> - so->so_rcv.sb_flags |= SB_OWNLOCK;
> - /* FALLTHROUGH */
> + so->so_rcv.sb_flags |= SB_MTXLOCK | SB_OWNLOCK;
> + break;
> case SOCK_RAW:
> - so->so_rcv.sb_flags |= SB_MTXLOCK;
> + so->so_rcv.sb_flags |= SB_MTXLOCK | SB_STANDALONE;
> break;
> }
> break;
> @@ -345,9 +347,22 @@ sofree(struct socket *so, int keep_lock)
> }
> #endif /* SOCKET_SPLICE */
> sbrelease(so, &so->so_snd);
> - sorflush(so);
> - if (!keep_lock)
> +
> + /*
> + * Regardless on '_locked' postfix, must release solock() before
> + * call sorflush_locked() for SB_STANDALONE marked socket. Can't
> + * release solock() and call sorflush() because solock() release
> + * is unwanted for tcp(4) socket.
> + */
> +
> + if (so->so_rcv.sb_flags & SB_STANDALONE)
> sounlock(so);
> +
> + sorflush_locked(so);
> +
> + if (!((so->so_rcv.sb_flags & SB_STANDALONE) || keep_lock))
> + sounlock(so);
> +
> #ifdef SOCKET_SPLICE
> if (so->so_sp) {
> /* Reuse splice idle, sounsplice() has been called before. */
> @@ -815,6 +830,7 @@ soreceive(struct socket *so, struct mbuf
> const struct protosw *pr = so->so_proto;
> struct mbuf *nextrecord;
> size_t resid, orig_resid = uio->uio_resid;
> + int dosolock = ((so->so_rcv.sb_flags & SB_STANDALONE) == 0);
>
> mp = mp0;
> if (paddr)
> @@ -844,12 +860,11 @@ bad:
> if (mp)
> *mp = NULL;
>
> - solock_shared(so);
> + if (dosolock)
> + solock_shared(so);
> restart:
> - if ((error = sblock(so, &so->so_rcv, SBLOCKWAIT(flags))) != 0) {
> - sounlock_shared(so);
> - return (error);
> - }
> + if ((error = sblock(so, &so->so_rcv, SBLOCKWAIT(flags))) != 0)
> + goto out;
> sb_mtx_lock(&so->so_rcv);
>
> m = so->so_rcv.sb_mb;
> @@ -914,14 +929,16 @@ restart:
> SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
> SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
>
> - if (so->so_rcv.sb_flags & SB_OWNLOCK) {
> + if (so->so_rcv.sb_flags & (SB_OWNLOCK | SB_STANDALONE)) {
> sbunlock_locked(so, &so->so_rcv);
> - sounlock_shared(so);
> + if (dosolock)
> + sounlock_shared(so);
> error = sbwait_locked(so, &so->so_rcv);
> sb_mtx_unlock(&so->so_rcv);
> if (error)
> return (error);
> - solock_shared(so);
> + if (dosolock)
> + solock_shared(so);
> } else {
> sb_mtx_unlock(&so->so_rcv);
> sbunlock(so, &so->so_rcv);
> @@ -998,11 +1015,13 @@ dontblock:
> if (controlp) {
> if (pr->pr_domain->dom_externalize) {
> sb_mtx_unlock(&so->so_rcv);
> - sounlock_shared(so);
> + if (dosolock)
> + sounlock_shared(so);
> error =
> (*pr->pr_domain->dom_externalize)
> (cm, controllen, flags);
> - solock_shared(so);
> + if (dosolock)
> + solock_shared(so);
> sb_mtx_lock(&so->so_rcv);
> }
> *controlp = cm;
> @@ -1081,9 +1100,11 @@ dontblock:
> SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
> resid = uio->uio_resid;
> sb_mtx_unlock(&so->so_rcv);
> - sounlock_shared(so);
> + if (dosolock)
> + sounlock_shared(so);
> uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio);
> - solock_shared(so);
> + if (dosolock)
> + solock_shared(so);
> sb_mtx_lock(&so->so_rcv);
> if (uio_error)
> uio->uio_resid = resid - len;
> @@ -1166,14 +1187,21 @@ dontblock:
> break;
> SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
> SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
> - sb_mtx_unlock(&so->so_rcv);
> - error = sbwait(so, &so->so_rcv);
> - if (error) {
> - sbunlock(so, &so->so_rcv);
> - sounlock_shared(so);
> - return (0);
> + if (dosolock) {
> + sb_mtx_unlock(&so->so_rcv);
> + error = sbwait(so, &so->so_rcv);
> + if (error) {
> + sbunlock(so, &so->so_rcv);
> + sounlock_shared(so);
> + return (0);
> + }
> + sb_mtx_lock(&so->so_rcv);
> + } else {
> + if (sbwait_locked(so, &so->so_rcv)) {
> + error = 0;
> + goto release;
> + }
> }
> - sb_mtx_lock(&so->so_rcv);
> if ((m = so->so_rcv.sb_mb) != NULL)
> nextrecord = m->m_nextpkt;
> }
> @@ -1222,7 +1250,9 @@ dontblock:
> release:
> sb_mtx_unlock(&so->so_rcv);
> sbunlock(so, &so->so_rcv);
> - sounlock_shared(so);
> +out:
> + if (dosolock)
> + sounlock_shared(so);
> return (error);
> }
>
> @@ -1231,7 +1261,6 @@ soshutdown(struct socket *so, int how)
> {
> int error = 0;
>
> - solock(so);
> switch (how) {
> case SHUT_RD:
> sorflush(so);
> @@ -1240,25 +1269,29 @@ soshutdown(struct socket *so, int how)
> sorflush(so);
> /* FALLTHROUGH */
> case SHUT_WR:
> + solock(so);
> error = pru_shutdown(so);
> + sounlock(so);
> break;
> default:
> error = EINVAL;
> break;
> }
> - sounlock(so);
>
> return (error);
> }
>
> void
> -sorflush(struct socket *so)
> +sorflush_locked(struct socket *so)
> {
> struct sockbuf *sb = &so->so_rcv;
> struct mbuf *m;
> const struct protosw *pr = so->so_proto;
> int error;
>
> + if ((sb->sb_flags & SB_STANDALONE) == 0)
> + soassertlocked(so);
> +
> error = sblock(so, sb, SBL_WAIT | SBL_NOINTR);
> /* with SBL_WAIT and SLB_NOINTR sblock() must not fail */
> KASSERT(error == 0);
> @@ -1275,6 +1308,16 @@ sorflush(struct socket *so)
> m_purge(m);
> }
>
> +void
> +sorflush(struct socket *so)
> +{
> + if ((so->so_rcv.sb_flags & SB_STANDALONE) == 0)
> + solock(so);
> + sorflush_locked(so);
> + if ((so->so_rcv.sb_flags & SB_STANDALONE) == 0)
> + sounlock(so);
> +}
> +
> #ifdef SOCKET_SPLICE
>
> #define so_splicelen so_sp->ssp_len
> @@ -1913,7 +1956,8 @@ sosetopt(struct socket *so, int level, i
> if ((long)cnt <= 0)
> cnt = 1;
>
> - solock(so);
> + if (((so->so_rcv.sb_flags & SB_STANDALONE) == 0))
> + solock(so);
> mtx_enter(&sb->sb_mtx);
>
> switch (optname) {
> @@ -1939,7 +1983,8 @@ sosetopt(struct socket *so, int level, i
> }
>
> mtx_leave(&sb->sb_mtx);
> - sounlock(so);
> + if (((so->so_rcv.sb_flags & SB_STANDALONE) == 0))
> + sounlock(so);
>
> break;
> }
> Index: sys/kern/uipc_socket2.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/uipc_socket2.c,v
> retrieving revision 1.147
> diff -u -p -r1.147 uipc_socket2.c
> --- sys/kern/uipc_socket2.c 31 Mar 2024 13:50:00 -0000 1.147
> +++ sys/kern/uipc_socket2.c 31 Mar 2024 20:30:10 -0000
> @@ -350,7 +350,9 @@ socantsendmore(struct socket *so)
> void
> socantrcvmore(struct socket *so)
> {
> - soassertlocked(so);
> + if ((so->so_rcv.sb_flags & SB_STANDALONE) == 0)
> + soassertlocked(so);
> +
> mtx_enter(&so->so_rcv.sb_mtx);
> so->so_rcv.sb_state |= SS_CANTRCVMORE;
> mtx_leave(&so->so_rcv.sb_mtx);
> @@ -557,6 +559,17 @@ sblock(struct socket *so, struct sockbuf
> {
> int error = 0, prio = PSOCK;
>
> + if (sb->sb_flags & SB_STANDALONE) {
> + int rwflags = RW_WRITE;
> +
> + if (!(flags & SBL_NOINTR || sb->sb_flags & SB_NOINTR))
> + rwflags |= RW_INTR;
> + if (!(flags & SBL_WAIT))
> + rwflags |= RW_NOSLEEP;
> +
> + return rw_enter(&sb->sb_lock, rwflags);
> + }
> +
> soassertlocked(so);
>
> mtx_enter(&sb->sb_mtx);
> @@ -589,6 +602,11 @@ out:
> void
> sbunlock_locked(struct socket *so, struct sockbuf *sb)
> {
> + if (sb->sb_flags & SB_STANDALONE) {
> + rw_exit(&sb->sb_lock);
> + return;
> + }
> +
> MUTEX_ASSERT_LOCKED(&sb->sb_mtx);
>
> sb->sb_flags &= ~SB_LOCK;
> @@ -601,6 +619,11 @@ sbunlock_locked(struct socket *so, struc
> void
> sbunlock(struct socket *so, struct sockbuf *sb)
> {
> + if (sb->sb_flags & SB_STANDALONE) {
> + rw_exit(&sb->sb_lock);
> + return;
> + }
> +
> mtx_enter(&sb->sb_mtx);
> sbunlock_locked(so, sb);
> mtx_leave(&sb->sb_mtx);
> Index: sys/kern/uipc_usrreq.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v
> retrieving revision 1.203
> diff -u -p -r1.203 uipc_usrreq.c
> --- sys/kern/uipc_usrreq.c 26 Mar 2024 09:46:47 -0000 1.203
> +++ sys/kern/uipc_usrreq.c 31 Mar 2024 20:30:10 -0000
> @@ -1450,9 +1450,7 @@ unp_gc(void *arg __unused)
> * thread.
> */
> so = unp->unp_socket;
> - solock(so);
> sorflush(so);
> - sounlock(so);
> }
> }
> }
> Index: sys/sys/socketvar.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/socketvar.h,v
> retrieving revision 1.127
> diff -u -p -r1.127 socketvar.h
> --- sys/sys/socketvar.h 27 Mar 2024 22:47:53 -0000 1.127
> +++ sys/sys/socketvar.h 31 Mar 2024 20:30:10 -0000
> @@ -106,7 +106,8 @@ struct socket {
> * Variables for socket buffering.
> */
> struct sockbuf {
> - struct mutex sb_mtx;
> + struct rwlock sb_lock;
> + struct mutex sb_mtx;
> /* The following fields are all zeroed on flush. */
> #define sb_startzero sb_cc
> u_long sb_cc; /* actual chars in buffer */
> @@ -136,6 +137,7 @@ struct socket {
> #define SB_NOINTR 0x0040 /* operations not interruptible */
> #define SB_MTXLOCK 0x0080 /* use sb_mtx for sockbuf protection */
> #define SB_OWNLOCK 0x0100 /* sb_mtx used standalone */
> +#define SB_STANDALONE 0x0200 /* standalone sblock() */
>
> void (*so_upcall)(struct socket *so, caddr_t arg, int waitf);
> caddr_t so_upcallarg; /* Arg for above */
> @@ -401,6 +403,7 @@ int sosend(struct socket *, struct mbuf
> int sosetopt(struct socket *, int, int, struct mbuf *);
> int soshutdown(struct socket *, int);
> void sorflush(struct socket *);
> +void sorflush_locked(struct socket *);
> void sowakeup(struct socket *, struct sockbuf *);
> void sorwakeup(struct socket *);
> void sowwakeup(struct socket *);
Don't take solock() in soreceive() for SOCK_RAW inet sockets
Don't take solock() in soreceive() for SOCK_RAW inet sockets