Don't take solock() in soreceive() for SOCK_RAW inet sockets
On Thu, Apr 11, 2024 at 10:15:43PM +0300, Vitaliy Makkoveev wrote:
> On Thu, Apr 11, 2024 at 03:07:55PM +0200, Alexander Bluhm wrote:
> > On Wed, Apr 10, 2024 at 10:29:57PM +0300, Vitaliy Makkoveev wrote:
> > > Really, this big diff will be hellish to commit and may receive
> > > fallout due to the very large area it touches.
> >
> > Better small progress than huge commits.
> >
> > > I see raw sockets as the best way to start, because they don't need to call
> >
> > The advantage of proceeding with raw sockets is that they are simple
> > and have no socket splicing. But the test coverage is low.
> >
> > > The tcp and udp sockets require the sosplice() and somove() rework
> > > to be started first, so I want to do them last.
> >
> > To get rid of MP races we need parallel stress tests. With UDP
> > this is easier than with raw IP. A while ago I used the diff below
> > to run UDP input with shared netlock. But it does not work
> > correctly with somove().
> >
> > To put parallel pressure onto the IP protocol and socket layer we
> > either have to make somove() MP safe for UDP, or adapt the diff
> > below for raw IP and write a bunch of tests.
> >
> > With parallel UDP we also get a lot of testing by snapshot users.
> >
>
> This diff triggers the soassertlocked() assertion in udp_input(). It
> seems it should be soassertlocked_readonly(), because you don't need
> to grab `so_lock'. I tested that diff together with the standalone
> sblock() for udp(4) sockets diff below; it looks stable.
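>
> A minimal sketch of the difference, assuming the in-tree
> soassertlocked_readonly() helper; the function around it is
> illustrative only, not the real udp_input():
>
>	/* protocol input path, entered with shared netlock only */
>	void
>	proto_input_sketch(struct socket *so)
>	{
>		/*
>		 * soassertlocked() wants the exclusive socket lock as
>		 * well and fires here.  For inet sockets the readonly
>		 * variant is satisfied by the shared netlock alone,
>		 * which is all a read-only check needs.
>		 */
>		soassertlocked_readonly(so);
>	}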
>
> The main problem for udp(4) sockets is the splicing ability. However,
> we can hold `sb_mtx' around `ssp_socket' modifications together with
> solock(), so `sb_mtx' alone is enough for the isspliced() check in
> soreceive(). The unlocked `so_sp' dereference is fine, because we set
> it only once for the whole socket lifetime, and we do this before the
> `ssp_socket' assignment.
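>
> A minimal sketch of the resulting check; the surrounding receive
> logic of the real soreceive() is omitted:
>
>	int
>	splice_check_sketch(struct socket *so)
>	{
>		int spliced;
>
>		/*
>		 * The unlocked `so_sp' dereference inside isspliced()
>		 * is safe: it is written once, before `ssp_socket'.
>		 * `ssp_socket' itself only changes with `sb_mtx' held,
>		 * so the mutex gives a stable answer.
>		 */
>		mtx_enter(&so->so_rcv.sb_mtx);
>		spliced = isspliced(so);
>		mtx_leave(&so->so_rcv.sb_mtx);
>
>		return (spliced);
>	}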
>
> sosplice() was reworked to accept the standalone sblock() of udp(4)
> sockets.
>
> We also need to take sblock() before splicing sockets, so sosplice()
> and soreceive() are serialized against each other. Since `sb_mtx' is
> also required to unsplice sockets, it serializes somove() with
> soreceive() as well, regardless of the somove() caller.
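>
> For SB_OWNLOCK sockets the lock order on the receive side then looks
> like this; a sketch of the pattern the diff below uses in sosplice():
>
>	/* udp(4): `so_rcv' has SB_OWNLOCK, sblock() comes first */
>	if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0)
>		return (error);
>	solock(so);
>	/* ... splice or unsplice under both locks ... */
>	sounlock(so);
>	sbunlock(so, &so->so_rcv);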
>
> soreceive() performs an unlocked `so_error' check and modification. I
> don't see a problem here. Previously, there was no way to predict
> which of the concurrent soreceive() or sosend() threads would fail
> and clear `so_error'. With this unlocked access we could have
> sosend() and soreceive() threads which fail together.
>
> `so_error' is stored in the local `error2' variable because
> `so_error' could be overwritten by a concurrent sosend() thread.
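>
> In code this is the pattern from the soreceive() hunk below, with the
> dontblock shortcut omitted:
>
>	/* snapshot `so_error' once; sosend() may clear it under us */
>	if ((error2 = READ_ONCE(so->so_error))) {
>		error = error2;
>		if ((flags & MSG_PEEK) == 0)
>			so->so_error = 0;
>	}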
Passes regress with witness.
Running UDP in parallel on top still has assertion warnings.
I think this is a step in the right direction.
OK bluhm@
> Index: sys/kern/uipc_socket.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/uipc_socket.c,v
> retrieving revision 1.329
> diff -u -p -r1.329 uipc_socket.c
> --- sys/kern/uipc_socket.c 11 Apr 2024 13:32:51 -0000 1.329
> +++ sys/kern/uipc_socket.c 11 Apr 2024 17:22:39 -0000
> @@ -159,8 +159,6 @@ soalloc(const struct protosw *prp, int w
> case AF_INET6:
> switch (prp->pr_type) {
> case SOCK_DGRAM:
> - so->so_rcv.sb_flags |= SB_MTXLOCK;
> - break;
> case SOCK_RAW:
> so->so_rcv.sb_flags |= SB_MTXLOCK | SB_OWNLOCK;
> break;
> @@ -819,7 +817,7 @@ soreceive(struct socket *so, struct mbuf
> struct mbuf *m, **mp;
> struct mbuf *cm;
> u_long len, offset, moff;
> - int flags, error, type, uio_error = 0;
> + int flags, error, error2, type, uio_error = 0;
> const struct protosw *pr = so->so_proto;
> struct mbuf *nextrecord;
> size_t resid, orig_resid = uio->uio_resid;
> @@ -889,10 +887,10 @@ restart:
> panic("receive 1: so %p, so_type %d, sb_cc %lu",
> so, so->so_type, so->so_rcv.sb_cc);
> #endif
> - if (so->so_error) {
> + if ((error2 = READ_ONCE(so->so_error))) {
> if (m)
> goto dontblock;
> - error = so->so_error;
> + error = error2;
> if ((flags & MSG_PEEK) == 0)
> so->so_error = 0;
> goto release;
> @@ -1289,7 +1287,13 @@ sorflush_locked(struct socket *so)
> error = sblock(so, sb, SBL_WAIT | SBL_NOINTR);
> /* with SBL_WAIT and SLB_NOINTR sblock() must not fail */
> KASSERT(error == 0);
> +
> + if (sb->sb_flags & SB_OWNLOCK)
> + solock(so);
> socantrcvmore(so);
> + if (sb->sb_flags & SB_OWNLOCK)
> + sounlock(so);
> +
> mtx_enter(&sb->sb_mtx);
> m = sb->sb_mb;
> memset(&sb->sb_startzero, 0,
> @@ -1323,13 +1327,17 @@ sorflush(struct socket *so)
> int
> sosplice(struct socket *so, int fd, off_t max, struct timeval *tv)
> {
> - struct file *fp;
> + struct file *fp = NULL;
> struct socket *sosp;
> - struct sosplice *sp;
> struct taskq *tq;
> int error = 0;
>
> - soassertlocked(so);
> + if ((so->so_proto->pr_flags & PR_SPLICE) == 0)
> + return (EPROTONOSUPPORT);
> + if (max && max < 0)
> + return (EINVAL);
> + if (tv && (tv->tv_sec < 0 || !timerisvalid(tv)))
> + return (EINVAL);
>
> if (sosplice_taskq == NULL) {
> rw_enter_write(&sosplice_lock);
> @@ -1350,63 +1358,51 @@ sosplice(struct socket *so, int fd, off_
> membar_consumer();
> }
>
> - if ((so->so_proto->pr_flags & PR_SPLICE) == 0)
> - return (EPROTONOSUPPORT);
> - if (so->so_options & SO_ACCEPTCONN)
> - return (EOPNOTSUPP);
> + if (so->so_rcv.sb_flags & SB_OWNLOCK) {
> + if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0)
> + return (error);
> + solock(so);
> + } else {
> + solock(so);
> + if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0) {
> + sounlock(so);
> + return (error);
> + }
> + }
> +
> + if (so->so_options & SO_ACCEPTCONN) {
> + error = EOPNOTSUPP;
> + goto out;
> + }
> if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
> - (so->so_proto->pr_flags & PR_CONNREQUIRED))
> - return (ENOTCONN);
> - if (so->so_sp == NULL) {
> - sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
> - if (so->so_sp == NULL)
> - so->so_sp = sp;
> - else
> - pool_put(&sosplice_pool, sp);
> + (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
> + error = ENOTCONN;
> + goto out;
> }
> + if (so->so_sp == NULL)
> + so->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
>
> /* If no fd is given, unsplice by removing existing link. */
> if (fd < 0) {
> - /* Lock receive buffer. */
> - if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0) {
> - return (error);
> - }
> if (so->so_sp->ssp_socket)
> sounsplice(so, so->so_sp->ssp_socket, 0);
> - sbunlock(so, &so->so_rcv);
> - return (0);
> + goto out;
> }
>
> - if (max && max < 0)
> - return (EINVAL);
> -
> - if (tv && (tv->tv_sec < 0 || !timerisvalid(tv)))
> - return (EINVAL);
> -
> /* Find sosp, the drain socket where data will be spliced into. */
> if ((error = getsock(curproc, fd, &fp)) != 0)
> - return (error);
> + goto out;
> sosp = fp->f_data;
> if (sosp->so_proto->pr_usrreqs->pru_send !=
> so->so_proto->pr_usrreqs->pru_send) {
> error = EPROTONOSUPPORT;
> - goto frele;
> - }
> - if (sosp->so_sp == NULL) {
> - sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
> - if (sosp->so_sp == NULL)
> - sosp->so_sp = sp;
> - else
> - pool_put(&sosplice_pool, sp);
> + goto out;
> }
> + if (sosp->so_sp == NULL)
> + sosp->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
>
> - /* Lock both receive and send buffer. */
> - if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0) {
> - goto frele;
> - }
> if ((error = sblock(so, &sosp->so_snd, SBL_WAIT)) != 0) {
> - sbunlock(so, &so->so_rcv);
> - goto frele;
> + goto out;
> }
>
> if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) {
> @@ -1423,8 +1419,10 @@ sosplice(struct socket *so, int fd, off_
> }
>
> /* Splice so and sosp together. */
> + mtx_enter(&so->so_rcv.sb_mtx);
> so->so_sp->ssp_socket = sosp;
> sosp->so_sp->ssp_soback = so;
> + mtx_leave(&so->so_rcv.sb_mtx);
> so->so_splicelen = 0;
> so->so_splicemax = max;
> if (tv)
> @@ -1447,17 +1445,18 @@ sosplice(struct socket *so, int fd, off_
>
> release:
> sbunlock(sosp, &sosp->so_snd);
> - sbunlock(so, &so->so_rcv);
> - frele:
> - /*
> - * FRELE() must not be called with the socket lock held. It is safe to
> - * release the lock here as long as no other operation happen on the
> - * socket when sosplice() returns. The dance could be avoided by
> - * grabbing the socket lock inside this function.
> - */
> - sounlock(so);
> - FRELE(fp, curproc);
> - solock(so);
> + out:
> + if (so->so_rcv.sb_flags & SB_OWNLOCK) {
> + sounlock(so);
> + sbunlock(so, &so->so_rcv);
> + } else {
> + sbunlock(so, &so->so_rcv);
> + sounlock(so);
> + }
> +
> + if (fp)
> + FRELE(fp, curproc);
> +
> return (error);
> }
>
> @@ -1469,10 +1468,12 @@ sounsplice(struct socket *so, struct soc
> task_del(sosplice_taskq, &so->so_splicetask);
> timeout_del(&so->so_idleto);
> sosp->so_snd.sb_flags &= ~SB_SPLICE;
> +
> mtx_enter(&so->so_rcv.sb_mtx);
> so->so_rcv.sb_flags &= ~SB_SPLICE;
> - mtx_leave(&so->so_rcv.sb_mtx);
> so->so_sp->ssp_socket = sosp->so_sp->ssp_soback = NULL;
> + mtx_leave(&so->so_rcv.sb_mtx);
> +
> /* Do not wakeup a socket that is about to be freed. */
> if ((freeing & SOSP_FREEING_READ) == 0 && soreadable(so))
> sorwakeup(so);
> @@ -2025,7 +2026,6 @@ sosetopt(struct socket *so, int level, i
> break;
> #ifdef SOCKET_SPLICE
> case SO_SPLICE:
> - solock(so);
> if (m == NULL) {
> error = sosplice(so, -1, 0, NULL);
> } else if (m->m_len < sizeof(int)) {
> @@ -2038,7 +2038,6 @@ sosetopt(struct socket *so, int level, i
> mtod(m, struct splice *)->sp_max,
> &mtod(m, struct splice *)->sp_idle);
> }
> - sounlock(so);
> break;
> #endif /* SOCKET_SPLICE */
>