From: Vitaliy Makkoveev <mvs@openbsd.org>
Subject: Re: Don't take solock() in soreceive() for SOCK_RAW inet sockets
To: Alexander Bluhm <bluhm@openbsd.org>
Cc: tech@openbsd.org
Date: Thu, 11 Apr 2024 22:15:43 +0300

On Thu, Apr 11, 2024 at 03:07:55PM +0200, Alexander Bluhm wrote:
> On Wed, Apr 10, 2024 at 10:29:57PM +0300, Vitaliy Makkoveev wrote:
> > Really, this big diff will be hellish to commit, and may receive
> > fallout due to the very large area it touches.
> 
> Better small progress than huge commits.
> 
> > I see raw sockets as the best way to start, because they don't need to call
> 
> The advantage of proceeding with raw sockets is that they are simple
> and have no socket splicing.  But the test coverage is low.
> 
> > The tcp and udp sockets require the sosplice() and somove() rework,
> > so I want to do them last.
> 
> To get rid of MP races we need parallel stress tests.  With UDP
> this is easier than with raw IP.  A while ago I used the diff below
> to run UDP input with shared netlock.  But it does not work correctly
> with somove().
> 
> To put parallel pressure onto the IP protocol and socket layer we
> either have to make somove() MP safe for UDP, or adapt the diff
> below for raw IP and write a bunch of tests.
> 
> With parallel UDP we also get a lot of testing by snapshot users.
> 

This diff triggers the soassertlocked() assertion in udp_input(). It
seems it should be soassertlocked_readonly(), because you don't need
to grab `so_lock'. I tested your diff together with the standalone
sblock() diff for udp(4) sockets below; it looks stable.
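
Something like this, wherever the assertion fires in the udp_input()
path (just a sketch, not included in the diff below):

-	soassertlocked(so);
+	soassertlocked_readonly(so);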

The biggest problem for udp(4) sockets is the splicing ability.
However, we can hold `sb_mtx' around `ssp_socket' modifications
together with solock(), so `sb_mtx' alone is enough for the
isspliced() check in soreceive(). The unlocked `so_sp' dereference is
fine, because we set it only once for the whole socket lifetime, and
we do this before the `ssp_socket' assignment.
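
A minimal sketch of the read side in soreceive() under this scheme
(not part of the diff below):

	int spliced;

	/*
	 * `ssp_socket' is only written with both solock() and `sb_mtx'
	 * held, so holding `sb_mtx' alone makes this check stable.
	 */
	mtx_enter(&so->so_rcv.sb_mtx);
	spliced = isspliced(so);
	mtx_leave(&so->so_rcv.sb_mtx);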

sosplice() was reworked to support the standalone sblock() for udp(4)
sockets.

We also need to take sblock() before splicing sockets, so that
sosplice() and soreceive() are serialized against each other. Since
`sb_mtx' is also required to unsplice sockets, it serializes somove()
with soreceive() too, regardless of the somove() caller.
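
For example, somove() could bail out under the same mutex when it
races with sounsplice() (again, only a sketch of the idea, not
included in the diff below):

	mtx_enter(&so->so_rcv.sb_mtx);
	if (so->so_sp->ssp_socket == NULL) {
		/* unspliced by a concurrent sounsplice() */
		mtx_leave(&so->so_rcv.sb_mtx);
		return (0);
	}
	mtx_leave(&so->so_rcv.sb_mtx);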

soreceive() performs an unlocked `so_error' check and modification. I
don't see a problem here. Previously, there was no way to predict
which of the concurrent soreceive() or sosend() threads would fail and
clear `so_error'. With this unlocked access, concurrent sosend() and
soreceive() threads could fail together.

`so_error' is stored in the local `error2' variable because
`so_error' could be overwritten by a concurrent sosend() thread.

Index: sys/kern/uipc_socket.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.329
diff -u -p -r1.329 uipc_socket.c
--- sys/kern/uipc_socket.c	11 Apr 2024 13:32:51 -0000	1.329
+++ sys/kern/uipc_socket.c	11 Apr 2024 17:22:39 -0000
@@ -159,8 +159,6 @@ soalloc(const struct protosw *prp, int w
 	case AF_INET6:
 		switch (prp->pr_type) {
 		case SOCK_DGRAM:
-			so->so_rcv.sb_flags |= SB_MTXLOCK;
-			break;
 		case SOCK_RAW:
 			so->so_rcv.sb_flags |= SB_MTXLOCK | SB_OWNLOCK;
 			break;
@@ -819,7 +817,7 @@ soreceive(struct socket *so, struct mbuf
 	struct mbuf *m, **mp;
 	struct mbuf *cm;
 	u_long len, offset, moff;
-	int flags, error, type, uio_error = 0;
+	int flags, error, error2, type, uio_error = 0;
 	const struct protosw *pr = so->so_proto;
 	struct mbuf *nextrecord;
 	size_t resid, orig_resid = uio->uio_resid;
@@ -889,10 +887,10 @@ restart:
 			panic("receive 1: so %p, so_type %d, sb_cc %lu",
 			    so, so->so_type, so->so_rcv.sb_cc);
 #endif
-		if (so->so_error) {
+		if ((error2 = READ_ONCE(so->so_error))) {
 			if (m)
 				goto dontblock;
-			error = so->so_error;
+			error = error2;
 			if ((flags & MSG_PEEK) == 0)
 				so->so_error = 0;
 			goto release;
@@ -1289,7 +1287,13 @@ sorflush_locked(struct socket *so)
 	error = sblock(so, sb, SBL_WAIT | SBL_NOINTR);
 	/* with SBL_WAIT and SLB_NOINTR sblock() must not fail */
 	KASSERT(error == 0);
+
+	if (sb->sb_flags & SB_OWNLOCK)
+		solock(so);
 	socantrcvmore(so);
+	if (sb->sb_flags & SB_OWNLOCK)
+		sounlock(so);
+
 	mtx_enter(&sb->sb_mtx);
 	m = sb->sb_mb;
 	memset(&sb->sb_startzero, 0,
@@ -1323,13 +1327,17 @@ sorflush(struct socket *so)
 int
 sosplice(struct socket *so, int fd, off_t max, struct timeval *tv)
 {
-	struct file	*fp;
+	struct file	*fp = NULL;
 	struct socket	*sosp;
-	struct sosplice	*sp;
 	struct taskq	*tq;
 	int		 error = 0;
 
-	soassertlocked(so);
+	if ((so->so_proto->pr_flags & PR_SPLICE) == 0)
+		return (EPROTONOSUPPORT);
+	if (max && max < 0)
+		return (EINVAL);
+	if (tv && (tv->tv_sec < 0 || !timerisvalid(tv)))
+		return (EINVAL);
 
 	if (sosplice_taskq == NULL) {
 		rw_enter_write(&sosplice_lock);
@@ -1350,63 +1358,51 @@ sosplice(struct socket *so, int fd, off_
 		membar_consumer();
 	}
 
-	if ((so->so_proto->pr_flags & PR_SPLICE) == 0)
-		return (EPROTONOSUPPORT);
-	if (so->so_options & SO_ACCEPTCONN)
-		return (EOPNOTSUPP);
+	if (so->so_rcv.sb_flags & SB_OWNLOCK) {
+		if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0)
+			return (error);
+		solock(so);
+	} else {
+		solock(so);
+		if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0) {
+			sounlock(so);
+			return (error);
+		}
+	}
+
+	if (so->so_options & SO_ACCEPTCONN) {
+		error = EOPNOTSUPP;
+		goto out;
+	}
 	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
-	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
-		return (ENOTCONN);
-	if (so->so_sp == NULL) {
-		sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
-		if (so->so_sp == NULL)
-			so->so_sp = sp;
-		else
-			pool_put(&sosplice_pool, sp);
+	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
+		error = ENOTCONN;
+		goto out;
 	}
+	if (so->so_sp == NULL)
+		so->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
 
 	/* If no fd is given, unsplice by removing existing link. */
 	if (fd < 0) {
-		/* Lock receive buffer. */
-		if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0) {
-			return (error);
-		}
 		if (so->so_sp->ssp_socket)
 			sounsplice(so, so->so_sp->ssp_socket, 0);
-		sbunlock(so, &so->so_rcv);
-		return (0);
+		goto out;
 	}
 
-	if (max && max < 0)
-		return (EINVAL);
-
-	if (tv && (tv->tv_sec < 0 || !timerisvalid(tv)))
-		return (EINVAL);
-
 	/* Find sosp, the drain socket where data will be spliced into. */
 	if ((error = getsock(curproc, fd, &fp)) != 0)
-		return (error);
+		goto out;
 	sosp = fp->f_data;
 	if (sosp->so_proto->pr_usrreqs->pru_send !=
 	    so->so_proto->pr_usrreqs->pru_send) {
 		error = EPROTONOSUPPORT;
-		goto frele;
-	}
-	if (sosp->so_sp == NULL) {
-		sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
-		if (sosp->so_sp == NULL)
-			sosp->so_sp = sp;
-		else
-			pool_put(&sosplice_pool, sp);
+		goto out;
 	}
+	if (sosp->so_sp == NULL)
+		sosp->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
 
-	/* Lock both receive and send buffer. */
-	if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0) {
-		goto frele;
-	}
 	if ((error = sblock(so, &sosp->so_snd, SBL_WAIT)) != 0) {
-		sbunlock(so, &so->so_rcv);
-		goto frele;
+		goto out;
 	}
 
 	if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) {
@@ -1423,8 +1419,10 @@ sosplice(struct socket *so, int fd, off_
 	}
 
 	/* Splice so and sosp together. */
+	mtx_enter(&so->so_rcv.sb_mtx);
 	so->so_sp->ssp_socket = sosp;
 	sosp->so_sp->ssp_soback = so;
+	mtx_leave(&so->so_rcv.sb_mtx);
 	so->so_splicelen = 0;
 	so->so_splicemax = max;
 	if (tv)
@@ -1447,17 +1445,18 @@ sosplice(struct socket *so, int fd, off_
 
  release:
 	sbunlock(sosp, &sosp->so_snd);
-	sbunlock(so, &so->so_rcv);
- frele:
-	/*
-	 * FRELE() must not be called with the socket lock held. It is safe to
-	 * release the lock here as long as no other operation happen on the
-	 * socket when sosplice() returns. The dance could be avoided by
-	 * grabbing the socket lock inside this function.
-	 */
-	sounlock(so);
-	FRELE(fp, curproc);
-	solock(so);
+ out:
+	if (so->so_rcv.sb_flags & SB_OWNLOCK) {
+		sounlock(so);
+		sbunlock(so, &so->so_rcv);
+	} else {
+		sbunlock(so, &so->so_rcv);
+		sounlock(so);
+	}
+
+	if (fp)
+		FRELE(fp, curproc);
+
 	return (error);
 }
 
@@ -1469,10 +1468,12 @@ sounsplice(struct socket *so, struct soc
 	task_del(sosplice_taskq, &so->so_splicetask);
 	timeout_del(&so->so_idleto);
 	sosp->so_snd.sb_flags &= ~SB_SPLICE;
+
 	mtx_enter(&so->so_rcv.sb_mtx);
 	so->so_rcv.sb_flags &= ~SB_SPLICE;
-	mtx_leave(&so->so_rcv.sb_mtx);
 	so->so_sp->ssp_socket = sosp->so_sp->ssp_soback = NULL;
+	mtx_leave(&so->so_rcv.sb_mtx);
+
 	/* Do not wakeup a socket that is about to be freed. */
 	if ((freeing & SOSP_FREEING_READ) == 0 && soreadable(so))
 		sorwakeup(so);
@@ -2025,7 +2026,6 @@ sosetopt(struct socket *so, int level, i
 			break;
 #ifdef SOCKET_SPLICE
 		case SO_SPLICE:
-			solock(so);
 			if (m == NULL) {
 				error = sosplice(so, -1, 0, NULL);
 			} else if (m->m_len < sizeof(int)) {
@@ -2038,7 +2038,6 @@ sosetopt(struct socket *so, int level, i
 				    mtod(m, struct splice *)->sp_max,
 				   &mtod(m, struct splice *)->sp_idle);
 			}
-			sounlock(so);
 			break;
 #endif /* SOCKET_SPLICE */