Don't take solock() in soreceive() for SOCK_RAW inet sockets
On Thu, Apr 11, 2024 at 03:07:55PM +0200, Alexander Bluhm wrote:
> On Wed, Apr 10, 2024 at 10:29:57PM +0300, Vitaliy Makkoveev wrote:
> > Really, this big diff will be the hellish one to commit and receive
> > possible fallout due to very huge area.
>
> Better small progress than huge commits.
>
> > I see raw sockets the best way to start, because they don't need to call
>
> The advantage of proceeding with raw sockets is that they are simple and
> have no socket splicing. But the test coverage is low.
>
> > The tcp and udp sockets require
> > to start sosplice() and somove() reworking, so I want to do this the
> > last.
>
> To get rid of MP races we need parallel stress tests. With UDP
> this is easier than raw IP. A while ago I used the diff below to
> run UDP input with shared netlock. But it does not work correctly
> with somove().
>
> To put parallel pressure onto the IP protocol and socket layer we
> either have to make somove() MP safe for UDP, or adapt the diff
> below for raw IP and write a bunch of tests.
>
> With parallel UDP we also get a lot of testing by snapshot users.
>
This diff triggers the soassertlocked() assertion in udp_input(). It
seems it should be soassertlocked_readonly() instead, because there is
no need to grab `so_lock'. I tested your diff together with the
standalone sblock() diff for udp(4) sockets below, and it looks stable.
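
For reference, a minimal sketch of why the read-only assertion should
be sufficient there (my reading of the assertions, not necessarily the
exact in-tree code): with a shared netlock the UDP input path only
reads socket state, so demanding the exclusive `so_lock' is too strong.

/*
 * Sketch, assuming the inet case is keyed on the netlock: any
 * netlock (shared or exclusive) is enough for read-only access,
 * while soassertlocked() would insist on exclusive ownership.
 */
void
soassertlocked_readonly(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_ASSERT_LOCKED();	/* shared netlock is fine */
		break;
	default:
		rw_assert_wrlock(&so->so_lock);
		break;
	}
}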
The biggest problem for udp(4) sockets is the splicing ability.
However, we can hold `sb_mtx' around `ssp_socket' modifications,
together with solock(). So `sb_mtx' alone is enough for the isspliced()
check in soreceive(). The unlocked `so_sp' dereference is fine, because
we set it only once for the whole socket lifetime, and we do this
before the `ssp_socket' assignment.
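
A minimal sketch of how the check can then look in soreceive()
(soreceive_isspliced() is a hypothetical helper of mine; isspliced()
is the existing macro):

/*
 * Sketch only: `ssp_socket' is set and cleared with `sb_mtx'
 * held, so the check needs no solock().  The unlocked so_sp
 * dereference inside isspliced() is fine, as so_sp is assigned
 * once, before ssp_socket, and lives as long as the socket.
 */
static inline int
soreceive_isspliced(struct socket *so)
{
	int spliced;

	mtx_enter(&so->so_rcv.sb_mtx);
	spliced = isspliced(so);
	mtx_leave(&so->so_rcv.sb_mtx);

	return (spliced);
}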
sosplice() was reworked to accept the standalone sblock() for udp(4)
sockets.
We also need to take sblock() before splicing sockets, so that
sosplice() and soreceive() are serialized. Since `sb_mtx' is also
required to unsplice sockets, it serializes somove() with soreceive()
regardless of the somove() caller.
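
To make the ordering explicit, this is the hierarchy I have in mind
for SB_OWNLOCK sockets (a summary of the diff below, not an in-tree
comment):

/*
 * Lock order sketch for SB_OWNLOCK (udp(4) and raw inet) sockets:
 *
 *	sblock(so, &so->so_rcv)	serializes sosplice(), sounsplice()
 *				and soreceive() against each other
 *	  -> solock(so)		still taken inside for socket state
 *	    -> sb_mtx		guards the ssp_socket linkage, so
 *				somove() and soreceive() agree on
 *				the splice state
 */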
soreceive() performs an unlocked `so_error' check and modification. I
don't see a problem here. Previously, there was no way to predict which
of the concurrent soreceive() or sosend() threads would fail and clear
`so_error'. With this unlocked access, concurrent sosend() and
soreceive() threads could fail together.
`so_error' is stored in the local `error2' variable because `so_error'
could be overwritten by a concurrent sosend() thread.
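
To spell out the race the local copy avoids (the pattern from the
hunk below, with my comments added):

	/*
	 * Snapshot so_error once.  Reading it a second time for
	 * the `error = so->so_error' assignment could observe a
	 * concurrent sosend() clearing it, and soreceive() would
	 * then return 0 without any data.
	 */
	if ((error2 = READ_ONCE(so->so_error))) {
		if (m)
			goto dontblock;
		error = error2;			/* use the snapshot */
		if ((flags & MSG_PEEK) == 0)
			so->so_error = 0;	/* unlocked clear, benign */
		goto release;
	}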
Index: sys/kern/uipc_socket.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.329
diff -u -p -r1.329 uipc_socket.c
--- sys/kern/uipc_socket.c 11 Apr 2024 13:32:51 -0000 1.329
+++ sys/kern/uipc_socket.c 11 Apr 2024 17:22:39 -0000
@@ -159,8 +159,6 @@ soalloc(const struct protosw *prp, int w
case AF_INET6:
switch (prp->pr_type) {
case SOCK_DGRAM:
- so->so_rcv.sb_flags |= SB_MTXLOCK;
- break;
case SOCK_RAW:
so->so_rcv.sb_flags |= SB_MTXLOCK | SB_OWNLOCK;
break;
@@ -819,7 +817,7 @@ soreceive(struct socket *so, struct mbuf
struct mbuf *m, **mp;
struct mbuf *cm;
u_long len, offset, moff;
- int flags, error, type, uio_error = 0;
+ int flags, error, error2, type, uio_error = 0;
const struct protosw *pr = so->so_proto;
struct mbuf *nextrecord;
size_t resid, orig_resid = uio->uio_resid;
@@ -889,10 +887,10 @@ restart:
panic("receive 1: so %p, so_type %d, sb_cc %lu",
so, so->so_type, so->so_rcv.sb_cc);
#endif
- if (so->so_error) {
+ if ((error2 = READ_ONCE(so->so_error))) {
if (m)
goto dontblock;
- error = so->so_error;
+ error = error2;
if ((flags & MSG_PEEK) == 0)
so->so_error = 0;
goto release;
@@ -1289,7 +1287,13 @@ sorflush_locked(struct socket *so)
error = sblock(so, sb, SBL_WAIT | SBL_NOINTR);
/* with SBL_WAIT and SLB_NOINTR sblock() must not fail */
KASSERT(error == 0);
+
+ if (sb->sb_flags & SB_OWNLOCK)
+ solock(so);
socantrcvmore(so);
+ if (sb->sb_flags & SB_OWNLOCK)
+ sounlock(so);
+
mtx_enter(&sb->sb_mtx);
m = sb->sb_mb;
memset(&sb->sb_startzero, 0,
@@ -1323,13 +1327,17 @@ sorflush(struct socket *so)
int
sosplice(struct socket *so, int fd, off_t max, struct timeval *tv)
{
- struct file *fp;
+ struct file *fp = NULL;
struct socket *sosp;
- struct sosplice *sp;
struct taskq *tq;
int error = 0;
- soassertlocked(so);
+ if ((so->so_proto->pr_flags & PR_SPLICE) == 0)
+ return (EPROTONOSUPPORT);
+ if (max && max < 0)
+ return (EINVAL);
+ if (tv && (tv->tv_sec < 0 || !timerisvalid(tv)))
+ return (EINVAL);
if (sosplice_taskq == NULL) {
rw_enter_write(&sosplice_lock);
@@ -1350,63 +1358,51 @@ sosplice(struct socket *so, int fd, off_
membar_consumer();
}
- if ((so->so_proto->pr_flags & PR_SPLICE) == 0)
- return (EPROTONOSUPPORT);
- if (so->so_options & SO_ACCEPTCONN)
- return (EOPNOTSUPP);
+ if (so->so_rcv.sb_flags & SB_OWNLOCK) {
+ if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0)
+ return (error);
+ solock(so);
+ } else {
+ solock(so);
+ if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0) {
+ sounlock(so);
+ return (error);
+ }
+ }
+
+ if (so->so_options & SO_ACCEPTCONN) {
+ error = EOPNOTSUPP;
+ goto out;
+ }
if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
- (so->so_proto->pr_flags & PR_CONNREQUIRED))
- return (ENOTCONN);
- if (so->so_sp == NULL) {
- sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
- if (so->so_sp == NULL)
- so->so_sp = sp;
- else
- pool_put(&sosplice_pool, sp);
+ (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
+ error = ENOTCONN;
+ goto out;
}
+ if (so->so_sp == NULL)
+ so->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
/* If no fd is given, unsplice by removing existing link. */
if (fd < 0) {
- /* Lock receive buffer. */
- if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0) {
- return (error);
- }
if (so->so_sp->ssp_socket)
sounsplice(so, so->so_sp->ssp_socket, 0);
- sbunlock(so, &so->so_rcv);
- return (0);
+ goto out;
}
- if (max && max < 0)
- return (EINVAL);
-
- if (tv && (tv->tv_sec < 0 || !timerisvalid(tv)))
- return (EINVAL);
-
/* Find sosp, the drain socket where data will be spliced into. */
if ((error = getsock(curproc, fd, &fp)) != 0)
- return (error);
+ goto out;
sosp = fp->f_data;
if (sosp->so_proto->pr_usrreqs->pru_send !=
so->so_proto->pr_usrreqs->pru_send) {
error = EPROTONOSUPPORT;
- goto frele;
- }
- if (sosp->so_sp == NULL) {
- sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
- if (sosp->so_sp == NULL)
- sosp->so_sp = sp;
- else
- pool_put(&sosplice_pool, sp);
+ goto out;
}
+ if (sosp->so_sp == NULL)
+ sosp->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
- /* Lock both receive and send buffer. */
- if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0) {
- goto frele;
- }
if ((error = sblock(so, &sosp->so_snd, SBL_WAIT)) != 0) {
- sbunlock(so, &so->so_rcv);
- goto frele;
+ goto out;
}
if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) {
@@ -1423,8 +1419,10 @@ sosplice(struct socket *so, int fd, off_
}
/* Splice so and sosp together. */
+ mtx_enter(&so->so_rcv.sb_mtx);
so->so_sp->ssp_socket = sosp;
sosp->so_sp->ssp_soback = so;
+ mtx_leave(&so->so_rcv.sb_mtx);
so->so_splicelen = 0;
so->so_splicemax = max;
if (tv)
@@ -1447,17 +1445,18 @@ sosplice(struct socket *so, int fd, off_
release:
sbunlock(sosp, &sosp->so_snd);
- sbunlock(so, &so->so_rcv);
- frele:
- /*
- * FRELE() must not be called with the socket lock held. It is safe to
- * release the lock here as long as no other operation happen on the
- * socket when sosplice() returns. The dance could be avoided by
- * grabbing the socket lock inside this function.
- */
- sounlock(so);
- FRELE(fp, curproc);
- solock(so);
+ out:
+ if (so->so_rcv.sb_flags & SB_OWNLOCK) {
+ sounlock(so);
+ sbunlock(so, &so->so_rcv);
+ } else {
+ sbunlock(so, &so->so_rcv);
+ sounlock(so);
+ }
+
+ if (fp)
+ FRELE(fp, curproc);
+
return (error);
}
@@ -1469,10 +1468,12 @@ sounsplice(struct socket *so, struct soc
task_del(sosplice_taskq, &so->so_splicetask);
timeout_del(&so->so_idleto);
sosp->so_snd.sb_flags &= ~SB_SPLICE;
+
mtx_enter(&so->so_rcv.sb_mtx);
so->so_rcv.sb_flags &= ~SB_SPLICE;
- mtx_leave(&so->so_rcv.sb_mtx);
so->so_sp->ssp_socket = sosp->so_sp->ssp_soback = NULL;
+ mtx_leave(&so->so_rcv.sb_mtx);
+
/* Do not wakeup a socket that is about to be freed. */
if ((freeing & SOSP_FREEING_READ) == 0 && soreadable(so))
sorwakeup(so);
@@ -2025,7 +2026,6 @@ sosetopt(struct socket *so, int level, i
break;
#ifdef SOCKET_SPLICE
case SO_SPLICE:
- solock(so);
if (m == NULL) {
error = sosplice(so, -1, 0, NULL);
} else if (m->m_len < sizeof(int)) {
@@ -2038,7 +2038,6 @@ sosetopt(struct socket *so, int level, i
mtod(m, struct splice *)->sp_max,
&mtod(m, struct splice *)->sp_idle);
}
- sounlock(so);
break;
#endif /* SOCKET_SPLICE */