Download raw body.
Don't take solock() in soreceive() for SOCK_RAW inet sockets
On Wed, Apr 03, 2024 at 09:57:09PM +0200, Alexander Bluhm wrote:
> On Sun, Mar 31, 2024 at 11:53:02PM +0300, Vitaliy Makkoveev wrote:
> > Reduce netlock pressure for inet sockets.
> >
> > These sockets are not connection oriented, they don't call pru_rcvd(),
> > they can't be spliced, they don't set `so_error'. Nothing to protect
> > with solock() in soreceive(), filt_soread() and filt_soexcept() paths.
> >
> > The `so_rcv' buffer is protected by the `sb_mtx' mutex(9), but since it
> > is released in the soreceive() path, sblock() is required to serialize
> > concurrent soreceive() and sorflush() threads. The current sblock() is a
> > kind of rwlock(9) implementation, so introduce `sb_lock' and use it
> > directly for that purpose.
> >
> > sorflush() and its callers were refactored to avoid solock() for raw
> > inet sockets. This is done to avoid stalling packet processing.
> >
> > The new SB_STANDALONE flag is partially redundant with SB_OWNLOCK, but
> > they will be merged into one flag later.
>
> Passed regress with witness. But it found splassert:
>
> splassert: soassertlocked: want 1 have 0
> Starting stack trace...
> soassertlocked(1) at soassertlocked+0xc7
> splassert: soassertlocked: want 1 have 0
> Parallel traceback, suppressed...
> soassertlocked(d782f188) at soassertlocked+0xc7
> sbreserve(d782f188,d782f270,10000) at sbreserve+0x19
> sosetopt(d782f188,ffff,1001,dabe6400) at sosetopt+0x295
> sys_setsockopt(d7709cb8,f5fbf850,f5fbf848) at sys_setsockopt+0x126
> syscall(f5fbf890) at syscall+0x41d
> Xsyscall_untramp() at Xsyscall_untramp+0xa9
>
This one and the corresponding sounlock() should check sb->sb_flags
instead of so->so_rcv.sb_flags; otherwise the `so_snd' sockbuf would be
left without the required lock. I fixed this in the updated diff.
> > @@ -1913,7 +1956,8 @@ sosetopt(struct socket *so, int level, i
> > if ((long)cnt <= 0)
> > cnt = 1;
> >
> > - solock(so);
> > + if (((so->so_rcv.sb_flags & SB_STANDALONE) == 0))
> > + solock(so);
> > mtx_enter(&sb->sb_mtx);
> >
> I am not a big fan of another SB_ flag. It is hard enough to
> understand the others.
>
Yes, SB_OWNLOCK alone is not needed, because all sockets except SOCK_RAW
use both SB_MTXLOCK and SB_OWNLOCK. This diff modifies SOCK_RAW sockets,
so now SB_MTXLOCK is used to modify sbwait() behaviour and SB_OWNLOCK to
mark sblock() as standalone.
I want to remove solock() from soreceive() for all sockets, so these
flags are temporary.
Index: sys/kern/uipc_socket.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.327
diff -u -p -r1.327 uipc_socket.c
--- sys/kern/uipc_socket.c 2 Apr 2024 14:23:15 -0000 1.327
+++ sys/kern/uipc_socket.c 3 Apr 2024 21:56:06 -0000
@@ -142,6 +142,8 @@ soalloc(const struct protosw *prp, int w
return (NULL);
rw_init_flags(&so->so_lock, dp->dom_name, RWL_DUPOK);
refcnt_init(&so->so_refcnt);
+ rw_init(&so->so_rcv.sb_lock, "sbufrcv");
+ rw_init(&so->so_snd.sb_lock, "sbufsnd");
mtx_init(&so->so_rcv.sb_mtx, IPL_MPFLOOR);
mtx_init(&so->so_snd.sb_mtx, IPL_MPFLOOR);
klist_init_mutex(&so->so_rcv.sb_klist, &so->so_rcv.sb_mtx);
@@ -155,15 +157,15 @@ soalloc(const struct protosw *prp, int w
case AF_INET6:
switch (prp->pr_type) {
case SOCK_DGRAM:
- so->so_rcv.sb_flags |= SB_OWNLOCK;
- /* FALLTHROUGH */
- case SOCK_RAW:
so->so_rcv.sb_flags |= SB_MTXLOCK;
break;
+ case SOCK_RAW:
+ so->so_rcv.sb_flags |= SB_MTXLOCK | SB_OWNLOCK;
+ break;
}
break;
case AF_UNIX:
- so->so_rcv.sb_flags |= SB_MTXLOCK | SB_OWNLOCK;
+ so->so_rcv.sb_flags |= SB_MTXLOCK;
break;
}
@@ -345,9 +347,22 @@ sofree(struct socket *so, int keep_lock)
}
#endif /* SOCKET_SPLICE */
sbrelease(so, &so->so_snd);
- sorflush(so);
- if (!keep_lock)
+
+ /*
+ * Regardless on '_locked' postfix, must release solock() before
+ * call sorflush_locked() for SB_OWNLOCK marked socket. Can't
+ * release solock() and call sorflush() because solock() release
+ * is unwanted for tcp(4) socket.
+ */
+
+ if (so->so_rcv.sb_flags & SB_OWNLOCK)
+ sounlock(so);
+
+ sorflush_locked(so);
+
+ if (!((so->so_rcv.sb_flags & SB_OWNLOCK) || keep_lock))
sounlock(so);
+
#ifdef SOCKET_SPLICE
if (so->so_sp) {
/* Reuse splice idle, sounsplice() has been called before. */
@@ -815,6 +830,7 @@ soreceive(struct socket *so, struct mbuf
const struct protosw *pr = so->so_proto;
struct mbuf *nextrecord;
size_t resid, orig_resid = uio->uio_resid;
+ int dosolock = ((so->so_rcv.sb_flags & SB_OWNLOCK) == 0);
mp = mp0;
if (paddr)
@@ -844,12 +860,11 @@ bad:
if (mp)
*mp = NULL;
- solock_shared(so);
+ if (dosolock)
+ solock_shared(so);
restart:
- if ((error = sblock(so, &so->so_rcv, SBLOCKWAIT(flags))) != 0) {
- sounlock_shared(so);
- return (error);
- }
+ if ((error = sblock(so, &so->so_rcv, SBLOCKWAIT(flags))) != 0)
+ goto out;
sb_mtx_lock(&so->so_rcv);
m = so->so_rcv.sb_mb;
@@ -914,14 +929,16 @@ restart:
SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
- if (so->so_rcv.sb_flags & SB_OWNLOCK) {
+ if (so->so_rcv.sb_flags & (SB_MTXLOCK | SB_OWNLOCK)) {
sbunlock_locked(so, &so->so_rcv);
- sounlock_shared(so);
+ if (dosolock)
+ sounlock_shared(so);
error = sbwait_locked(so, &so->so_rcv);
sb_mtx_unlock(&so->so_rcv);
if (error)
return (error);
- solock_shared(so);
+ if (dosolock)
+ solock_shared(so);
} else {
sb_mtx_unlock(&so->so_rcv);
sbunlock(so, &so->so_rcv);
@@ -998,11 +1015,13 @@ dontblock:
if (controlp) {
if (pr->pr_domain->dom_externalize) {
sb_mtx_unlock(&so->so_rcv);
- sounlock_shared(so);
+ if (dosolock)
+ sounlock_shared(so);
error =
(*pr->pr_domain->dom_externalize)
(cm, controllen, flags);
- solock_shared(so);
+ if (dosolock)
+ solock_shared(so);
sb_mtx_lock(&so->so_rcv);
}
*controlp = cm;
@@ -1081,9 +1100,11 @@ dontblock:
SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
resid = uio->uio_resid;
sb_mtx_unlock(&so->so_rcv);
- sounlock_shared(so);
+ if (dosolock)
+ sounlock_shared(so);
uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio);
- solock_shared(so);
+ if (dosolock)
+ solock_shared(so);
sb_mtx_lock(&so->so_rcv);
if (uio_error)
uio->uio_resid = resid - len;
@@ -1166,14 +1187,21 @@ dontblock:
break;
SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
- sb_mtx_unlock(&so->so_rcv);
- error = sbwait(so, &so->so_rcv);
- if (error) {
- sbunlock(so, &so->so_rcv);
- sounlock_shared(so);
- return (0);
+ if (dosolock) {
+ sb_mtx_unlock(&so->so_rcv);
+ error = sbwait(so, &so->so_rcv);
+ if (error) {
+ sbunlock(so, &so->so_rcv);
+ sounlock_shared(so);
+ return (0);
+ }
+ sb_mtx_lock(&so->so_rcv);
+ } else {
+ if (sbwait_locked(so, &so->so_rcv)) {
+ error = 0;
+ goto release;
+ }
}
- sb_mtx_lock(&so->so_rcv);
if ((m = so->so_rcv.sb_mb) != NULL)
nextrecord = m->m_nextpkt;
}
@@ -1222,7 +1250,9 @@ dontblock:
release:
sb_mtx_unlock(&so->so_rcv);
sbunlock(so, &so->so_rcv);
- sounlock_shared(so);
+out:
+ if (dosolock)
+ sounlock_shared(so);
return (error);
}
@@ -1231,7 +1261,6 @@ soshutdown(struct socket *so, int how)
{
int error = 0;
- solock(so);
switch (how) {
case SHUT_RD:
sorflush(so);
@@ -1240,25 +1269,29 @@ soshutdown(struct socket *so, int how)
sorflush(so);
/* FALLTHROUGH */
case SHUT_WR:
+ solock(so);
error = pru_shutdown(so);
+ sounlock(so);
break;
default:
error = EINVAL;
break;
}
- sounlock(so);
return (error);
}
void
-sorflush(struct socket *so)
+sorflush_locked(struct socket *so)
{
struct sockbuf *sb = &so->so_rcv;
struct mbuf *m;
const struct protosw *pr = so->so_proto;
int error;
+ if ((sb->sb_flags & SB_OWNLOCK) == 0)
+ soassertlocked(so);
+
error = sblock(so, sb, SBL_WAIT | SBL_NOINTR);
/* with SBL_WAIT and SLB_NOINTR sblock() must not fail */
KASSERT(error == 0);
@@ -1275,6 +1308,16 @@ sorflush(struct socket *so)
m_purge(m);
}
+void
+sorflush(struct socket *so)
+{
+ if ((so->so_rcv.sb_flags & SB_OWNLOCK) == 0)
+ solock(so);
+ sorflush_locked(so);
+ if ((so->so_rcv.sb_flags & SB_OWNLOCK) == 0)
+ sounlock(so);
+}
+
#ifdef SOCKET_SPLICE
#define so_splicelen so_sp->ssp_len
@@ -1913,7 +1956,8 @@ sosetopt(struct socket *so, int level, i
if ((long)cnt <= 0)
cnt = 1;
- solock(so);
+ if (((sb->sb_flags & SB_OWNLOCK) == 0))
+ solock(so);
mtx_enter(&sb->sb_mtx);
switch (optname) {
@@ -1939,7 +1983,8 @@ sosetopt(struct socket *so, int level, i
}
mtx_leave(&sb->sb_mtx);
- sounlock(so);
+ if (((sb->sb_flags & SB_OWNLOCK) == 0))
+ sounlock(so);
break;
}
Index: sys/kern/uipc_socket2.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket2.c,v
retrieving revision 1.147
diff -u -p -r1.147 uipc_socket2.c
--- sys/kern/uipc_socket2.c 31 Mar 2024 13:50:00 -0000 1.147
+++ sys/kern/uipc_socket2.c 3 Apr 2024 21:56:06 -0000
@@ -350,7 +350,9 @@ socantsendmore(struct socket *so)
void
socantrcvmore(struct socket *so)
{
- soassertlocked(so);
+ if ((so->so_rcv.sb_flags & SB_OWNLOCK) == 0)
+ soassertlocked(so);
+
mtx_enter(&so->so_rcv.sb_mtx);
so->so_rcv.sb_state |= SS_CANTRCVMORE;
mtx_leave(&so->so_rcv.sb_mtx);
@@ -557,6 +559,17 @@ sblock(struct socket *so, struct sockbuf
{
int error = 0, prio = PSOCK;
+ if (sb->sb_flags & SB_OWNLOCK) {
+ int rwflags = RW_WRITE;
+
+ if (!(flags & SBL_NOINTR || sb->sb_flags & SB_NOINTR))
+ rwflags |= RW_INTR;
+ if (!(flags & SBL_WAIT))
+ rwflags |= RW_NOSLEEP;
+
+ return rw_enter(&sb->sb_lock, rwflags);
+ }
+
soassertlocked(so);
mtx_enter(&sb->sb_mtx);
@@ -589,6 +602,11 @@ out:
void
sbunlock_locked(struct socket *so, struct sockbuf *sb)
{
+ if (sb->sb_flags & SB_OWNLOCK) {
+ rw_exit(&sb->sb_lock);
+ return;
+ }
+
MUTEX_ASSERT_LOCKED(&sb->sb_mtx);
sb->sb_flags &= ~SB_LOCK;
@@ -601,6 +619,11 @@ sbunlock_locked(struct socket *so, struc
void
sbunlock(struct socket *so, struct sockbuf *sb)
{
+ if (sb->sb_flags & SB_OWNLOCK) {
+ rw_exit(&sb->sb_lock);
+ return;
+ }
+
mtx_enter(&sb->sb_mtx);
sbunlock_locked(so, sb);
mtx_leave(&sb->sb_mtx);
Index: sys/kern/uipc_usrreq.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v
retrieving revision 1.203
diff -u -p -r1.203 uipc_usrreq.c
--- sys/kern/uipc_usrreq.c 26 Mar 2024 09:46:47 -0000 1.203
+++ sys/kern/uipc_usrreq.c 3 Apr 2024 21:56:06 -0000
@@ -1450,9 +1450,7 @@ unp_gc(void *arg __unused)
* thread.
*/
so = unp->unp_socket;
- solock(so);
sorflush(so);
- sounlock(so);
}
}
}
Index: sys/sys/socketvar.h
===================================================================
RCS file: /cvs/src/sys/sys/socketvar.h,v
retrieving revision 1.127
diff -u -p -r1.127 socketvar.h
--- sys/sys/socketvar.h 27 Mar 2024 22:47:53 -0000 1.127
+++ sys/sys/socketvar.h 3 Apr 2024 21:56:06 -0000
@@ -106,7 +106,8 @@ struct socket {
* Variables for socket buffering.
*/
struct sockbuf {
- struct mutex sb_mtx;
+ struct rwlock sb_lock;
+ struct mutex sb_mtx;
/* The following fields are all zeroed on flush. */
#define sb_startzero sb_cc
u_long sb_cc; /* actual chars in buffer */
@@ -135,7 +136,7 @@ struct socket {
#define SB_SPLICE 0x0020 /* buffer is splice source or drain */
#define SB_NOINTR 0x0040 /* operations not interruptible */
#define SB_MTXLOCK 0x0080 /* use sb_mtx for sockbuf protection */
-#define SB_OWNLOCK 0x0100 /* sb_mtx used standalone */
+#define SB_OWNLOCK 0x0100 /* sblock() doesn't need solock() */
void (*so_upcall)(struct socket *so, caddr_t arg, int waitf);
caddr_t so_upcallarg; /* Arg for above */
@@ -401,6 +402,7 @@ int sosend(struct socket *, struct mbuf
int sosetopt(struct socket *, int, int, struct mbuf *);
int soshutdown(struct socket *, int);
void sorflush(struct socket *);
+void sorflush_locked(struct socket *);
void sowakeup(struct socket *, struct sockbuf *);
void sorwakeup(struct socket *);
void sowwakeup(struct socket *);
Don't take solock() in soreceive() for SOCK_RAW inet sockets