From: Alexander Bluhm Subject: Re: Don't take solock() in soreceive() for SOCK_RAW inet sockets To: Vitaliy Makkoveev Cc: tech@openbsd.org Date: Mon, 15 Apr 2024 20:52:44 +0200 On Thu, Apr 11, 2024 at 10:15:43PM +0300, Vitaliy Makkoveev wrote: > On Thu, Apr 11, 2024 at 03:07:55PM +0200, Alexander Bluhm wrote: > > On Wed, Apr 10, 2024 at 10:29:57PM +0300, Vitaliy Makkoveev wrote: > > > Really, this big diff will be the hellish one to commit and receive > > > possible fallout due to very huge area. > > > > Better small progress than huge commits. > > > > > I see raw sockets the best way to start, because they don't need to call > > > > Advantage to proceed with raw sockets is that they are simple and > > have no socket splicing. But the test coverage is low. > > > > > The tcp and udp sockets require > > > to start sosplice() and somove() reworking, so I want to do this the > > > last. > > > > To get rid of MP races we need parallel stress tests. With UDP > > this is easier than raw IP. A while ago I used the diff below to > > run UDP input with shared netlock. But it does not work correctly > > with somove(). > > > > To put parallel pressure onto the IP protocol and socket layer we > > either have to make somove() MP safe for UDP, or adapt the diff > > below for raw IP and write a bunch of tests. > > > > With parallel UDP we also get a lot of testing by snapshot users. > > > > This diff triggers soassertlocked() assertion in udp_input(). Seems it > should be soassertlocked_readonly() because you don't need to grab > `so_lock'. I tested this diff with this standalone sblock() for udp(4) > sockets diff, looks stable. > > The main problem for udp(4) sockets is the splicing ability. However, we can > hold `sb_mtx' around `ssp_socket' modifications together with solock(). > So the `sb_mtx' will be enough for the isspliced() check in > soreceive(). 
The unlocked `so_sp' dereference is fine, because we set it > only once for the whole socket life-time and we do this before > `ssp_socket' assignment. > > The sosplice() was reworked to accept standalone sblock() for udp(4) > sockets. > > We also need to take sblock() before splicing sockets, so the sosplice() > and soreceive() are both serialized. Since `sb_mtx' is required to unsplice > sockets too, it also serializes somove() with soreceive() regardless of > the somove() caller. > > soreceive() performs unlocked `so_error' check and modification. I don't > see a problem here. Previously, you had no ability to predict which > concurrent soreceive() or sosend() thread will fail and clean > `so_error'. With this unlocked access we could have sosend() and > soreceive() threads which fail together. > > `so_error' is stored to the local `error2' variable because `so_error' could be > overwritten by a concurrent sosend() thread. Passes regress with witness. Running UDP in parallel on top still has assertion warnings. I think it is a step in the right direction. 
OK bluhm@ > Index: sys/kern/uipc_socket.c > =================================================================== > RCS file: /cvs/src/sys/kern/uipc_socket.c,v > retrieving revision 1.329 > diff -u -p -r1.329 uipc_socket.c > --- sys/kern/uipc_socket.c 11 Apr 2024 13:32:51 -0000 1.329 > +++ sys/kern/uipc_socket.c 11 Apr 2024 17:22:39 -0000 > @@ -159,8 +159,6 @@ soalloc(const struct protosw *prp, int w > case AF_INET6: > switch (prp->pr_type) { > case SOCK_DGRAM: > - so->so_rcv.sb_flags |= SB_MTXLOCK; > - break; > case SOCK_RAW: > so->so_rcv.sb_flags |= SB_MTXLOCK | SB_OWNLOCK; > break; > @@ -819,7 +817,7 @@ soreceive(struct socket *so, struct mbuf > struct mbuf *m, **mp; > struct mbuf *cm; > u_long len, offset, moff; > - int flags, error, type, uio_error = 0; > + int flags, error, error2, type, uio_error = 0; > const struct protosw *pr = so->so_proto; > struct mbuf *nextrecord; > size_t resid, orig_resid = uio->uio_resid; > @@ -889,10 +887,10 @@ restart: > panic("receive 1: so %p, so_type %d, sb_cc %lu", > so, so->so_type, so->so_rcv.sb_cc); > #endif > - if (so->so_error) { > + if ((error2 = READ_ONCE(so->so_error))) { > if (m) > goto dontblock; > - error = so->so_error; > + error = error2; > if ((flags & MSG_PEEK) == 0) > so->so_error = 0; > goto release; > @@ -1289,7 +1287,13 @@ sorflush_locked(struct socket *so) > error = sblock(so, sb, SBL_WAIT | SBL_NOINTR); > /* with SBL_WAIT and SLB_NOINTR sblock() must not fail */ > KASSERT(error == 0); > + > + if (sb->sb_flags & SB_OWNLOCK) > + solock(so); > socantrcvmore(so); > + if (sb->sb_flags & SB_OWNLOCK) > + sounlock(so); > + > mtx_enter(&sb->sb_mtx); > m = sb->sb_mb; > memset(&sb->sb_startzero, 0, > @@ -1323,13 +1327,17 @@ sorflush(struct socket *so) > int > sosplice(struct socket *so, int fd, off_t max, struct timeval *tv) > { > - struct file *fp; > + struct file *fp = NULL; > struct socket *sosp; > - struct sosplice *sp; > struct taskq *tq; > int error = 0; > > - soassertlocked(so); > + if ((so->so_proto->pr_flags & 
PR_SPLICE) == 0) > + return (EPROTONOSUPPORT); > + if (max && max < 0) > + return (EINVAL); > + if (tv && (tv->tv_sec < 0 || !timerisvalid(tv))) > + return (EINVAL); > > if (sosplice_taskq == NULL) { > rw_enter_write(&sosplice_lock); > @@ -1350,63 +1358,51 @@ sosplice(struct socket *so, int fd, off_ > membar_consumer(); > } > > - if ((so->so_proto->pr_flags & PR_SPLICE) == 0) > - return (EPROTONOSUPPORT); > - if (so->so_options & SO_ACCEPTCONN) > - return (EOPNOTSUPP); > + if (so->so_rcv.sb_flags & SB_OWNLOCK) { > + if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0) > + return (error); > + solock(so); > + } else { > + solock(so); > + if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0) { > + sounlock(so); > + return (error); > + } > + } > + > + if (so->so_options & SO_ACCEPTCONN) { > + error = EOPNOTSUPP; > + goto out; > + } > if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && > - (so->so_proto->pr_flags & PR_CONNREQUIRED)) > - return (ENOTCONN); > - if (so->so_sp == NULL) { > - sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO); > - if (so->so_sp == NULL) > - so->so_sp = sp; > - else > - pool_put(&sosplice_pool, sp); > + (so->so_proto->pr_flags & PR_CONNREQUIRED)) { > + error = ENOTCONN; > + goto out; > } > + if (so->so_sp == NULL) > + so->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO); > > /* If no fd is given, unsplice by removing existing link. */ > if (fd < 0) { > - /* Lock receive buffer. */ > - if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0) { > - return (error); > - } > if (so->so_sp->ssp_socket) > sounsplice(so, so->so_sp->ssp_socket, 0); > - sbunlock(so, &so->so_rcv); > - return (0); > + goto out; > } > > - if (max && max < 0) > - return (EINVAL); > - > - if (tv && (tv->tv_sec < 0 || !timerisvalid(tv))) > - return (EINVAL); > - > /* Find sosp, the drain socket where data will be spliced into. 
*/ > if ((error = getsock(curproc, fd, &fp)) != 0) > - return (error); > + goto out; > sosp = fp->f_data; > if (sosp->so_proto->pr_usrreqs->pru_send != > so->so_proto->pr_usrreqs->pru_send) { > error = EPROTONOSUPPORT; > - goto frele; > - } > - if (sosp->so_sp == NULL) { > - sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO); > - if (sosp->so_sp == NULL) > - sosp->so_sp = sp; > - else > - pool_put(&sosplice_pool, sp); > + goto out; > } > + if (sosp->so_sp == NULL) > + sosp->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO); > > - /* Lock both receive and send buffer. */ > - if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0) { > - goto frele; > - } > if ((error = sblock(so, &sosp->so_snd, SBL_WAIT)) != 0) { > - sbunlock(so, &so->so_rcv); > - goto frele; > + goto out; > } > > if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) { > @@ -1423,8 +1419,10 @@ sosplice(struct socket *so, int fd, off_ > } > > /* Splice so and sosp together. */ > + mtx_enter(&so->so_rcv.sb_mtx); > so->so_sp->ssp_socket = sosp; > sosp->so_sp->ssp_soback = so; > + mtx_leave(&so->so_rcv.sb_mtx); > so->so_splicelen = 0; > so->so_splicemax = max; > if (tv) > @@ -1447,17 +1445,18 @@ sosplice(struct socket *so, int fd, off_ > > release: > sbunlock(sosp, &sosp->so_snd); > - sbunlock(so, &so->so_rcv); > - frele: > - /* > - * FRELE() must not be called with the socket lock held. It is safe to > - * release the lock here as long as no other operation happen on the > - * socket when sosplice() returns. The dance could be avoided by > - * grabbing the socket lock inside this function. 
> - */ > - sounlock(so); > - FRELE(fp, curproc); > - solock(so); > + out: > + if (so->so_rcv.sb_flags & SB_OWNLOCK) { > + sounlock(so); > + sbunlock(so, &so->so_rcv); > + } else { > + sbunlock(so, &so->so_rcv); > + sounlock(so); > + } > + > + if (fp) > + FRELE(fp, curproc); > + > return (error); > } > > @@ -1469,10 +1468,12 @@ sounsplice(struct socket *so, struct soc > task_del(sosplice_taskq, &so->so_splicetask); > timeout_del(&so->so_idleto); > sosp->so_snd.sb_flags &= ~SB_SPLICE; > + > mtx_enter(&so->so_rcv.sb_mtx); > so->so_rcv.sb_flags &= ~SB_SPLICE; > - mtx_leave(&so->so_rcv.sb_mtx); > so->so_sp->ssp_socket = sosp->so_sp->ssp_soback = NULL; > + mtx_leave(&so->so_rcv.sb_mtx); > + > /* Do not wakeup a socket that is about to be freed. */ > if ((freeing & SOSP_FREEING_READ) == 0 && soreadable(so)) > sorwakeup(so); > @@ -2025,7 +2026,6 @@ sosetopt(struct socket *so, int level, i > break; > #ifdef SOCKET_SPLICE > case SO_SPLICE: > - solock(so); > if (m == NULL) { > error = sosplice(so, -1, 0, NULL); > } else if (m->m_len < sizeof(int)) { > @@ -2038,7 +2038,6 @@ sosetopt(struct socket *so, int level, i > mtod(m, struct splice *)->sp_max, > &mtod(m, struct splice *)->sp_idle); > } > - sounlock(so); > break; > #endif /* SOCKET_SPLICE */ >