From: Vitaliy Makkoveev Subject: Re: tcp(4): use per-sockbuf mutex to protect `so_snd' socket buffer To: Alexander Bluhm Cc: OpenBSD tech Date: Tue, 24 Dec 2024 22:21:55 +0300 On Tue, Dec 24, 2024 at 08:53:14PM +0300, Vitaliy Makkoveev wrote: > On Tue, Dec 24, 2024 at 07:59:38PM +0300, Vitaliy Makkoveev wrote: > > > On 24 Dec 2024, at 19:49, Alexander Bluhm wrote: > > > > > > On Sat, Dec 21, 2024 at 08:16:05AM +0300, Vitaliy Makkoveev wrote: > > >> This diff only unlocks sosend() path, all the rest is still locked > > >> exclusively. > > >> > > >> Even for tcp(4) case, sosend() only checks `so_snd' free space and > > >> sleeps if necessary, actual buffer handling happening within exclusively > > >> locked PCB layer. In the PCB layer we don't need to protect read-only > > >> access to `so_snd' because corresponding read-write access is serialized > > >> by socket lock. > > >> > > >> sosend() needs to be serialized with somove(), so sotask() takes > > >> sblock() on `so_snd' of spliced socket. This discards previous > > >> protection of tcp(4) spliced sockets where solock() was used to prevent > > >> concurrent unsplicing. This scheme was used to avoid sleep in sofree(). > > >> > > >> However, sleep is sofree() is possible. We have two cases: > > >> > > >> 1. Socket was not yet accept(2)ed. Such sockets can't be accessed from > > >> the userland and can't be spliced. sofree() could be called only from > > >> PCB layer and don't need to sleep. > > >> > > >> 2. Socket was accepted. While called form PCB layer, sofree() ignores it > > >> because SS_NOFDREF bit is not set. Socket remains spliced, but without > > >> PCB. Sockets without PCB can't be accessed from PCB layer, only from > > >> userland. So, soclose()/sofree() thread needs to wait concurrent > > >> soclose()/sofree() thread for spliced socket as is is already done for > > >> udp(4) case. In such case it is safe to release solock() and sleep > > >> within sofree(). > > >> > > >> Note, I intentionally left all "if (dosolock)" dances as is. > > > > > > While running your diff though my performnce tests, I got this panic > > > twice. I have never seen it before, so I guess it is related. > > > > > > panic: softclock: invalid to_clock: 19668320 > > > Stopped at db_enter+0x14: popq %rbp > > > TID PID UID PRFLAGS PFLAGS CPU COMMAND > > > 120146 91986 89 0x1100012 0 1 relayd > > > 65124 13759 89 0x1100012 0 3 relayd > > > 33160 19438 0 0x14000 0x200 2 softnet0 > > > db_enter() at db_enter+0x14 > > > panic(ffffffff82883be3) at panic+0xdd > > > softclock(0) at softclock+0x1d5 > > > softintr_dispatch(0) at softintr_dispatch+0xe6 > > > Xsoftclock() at Xsoftclock+0x27 > > > acpicpu_idle() at acpicpu_idle+0x2b9 > > > sched_idle(ffffffff83593ff0) at sched_idle+0x298 > > > end trace frame: 0x0, count: 8 > > > https://www.openbsd.org/ddb.html describes the minimum info required in bug > > > reports. Insufficient info makes it difficult to find and fix bugs. > > > > > > ddb{0}> x/s version > > > version: OpenBSD 7.6-current (GENERIC.MP) #cvs : D2024.12.23.00.00.00: Tue Dec 24 14:53:39 CET 2024 > > > root@ot15.obsd-lab.genua.de:/usr/src/sys/arch/amd64/compile/GENERIC.MP > > > > > > ddb{0}> show panic > > > *cpu0: softclock: invalid to_clock: 19668320 > > > > Interesting. Which test triggers this? I suspect sosplice. > > This diff contains only sosplice() relared hunks. Regardless `so_snd' > unlocking, our current sosend() releases solock() around uiomove(), so > concurrent somove() thread could break us. This diff prevents this by > taking sblock() on `so_snd' of spliced socket. > > I performed couple of kern/sosplice runs without any issue. Alexander, > could you test it too? I suspect the problem could be in timeout > reinitialisation in sofree(), but I can't trigger it during regress run. > This is the updated diff. It uses barriers instead of `ssp_idleto' timeout and `ssp_task' task redefinition. Index: sys/kern/uipc_socket.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_socket.c,v diff -u -p -r1.347 uipc_socket.c --- sys/kern/uipc_socket.c 19 Dec 2024 22:11:35 -0000 1.347 +++ sys/kern/uipc_socket.c 24 Dec 2024 19:12:56 -0000 @@ -62,8 +62,6 @@ int sosplice(struct socket *, int, off_t void sounsplice(struct socket *, struct socket *, int); void soidle(void *); void sotask(void *); -void soreaper(void *); -void soput(void *); int somove(struct socket *, int); void sorflush(struct socket *); @@ -327,17 +325,14 @@ sofree(struct socket *so, int keep_lock) sounlock(head); } - switch (so->so_proto->pr_domain->dom_family) { - case AF_INET: - case AF_INET6: - if (so->so_proto->pr_type == SOCK_STREAM) - break; - /* FALLTHROUGH */ - default: + if (!keep_lock) { + /* + * sofree() was called from soclose(). Sleep is safe + * even for tcp(4) sockets. + */ sounlock(so); refcnt_finalize(&so->so_refcnt, "sofinal"); solock(so); - break; } sigio_free(&so->so_sigio); @@ -358,20 +353,20 @@ sofree(struct socket *so, int keep_lock) (*so->so_proto->pr_domain->dom_dispose)(so->so_rcv.sb_mb); m_purge(so->so_rcv.sb_mb); - if (!keep_lock) + if (!keep_lock) { sounlock(so); #ifdef SOCKET_SPLICE - if (so->so_sp) { - /* Reuse splice idle, sounsplice() has been called before. */ - timeout_set_flags(&so->so_sp->ssp_idleto, soreaper, so, - KCLOCK_NONE, TIMEOUT_PROC | TIMEOUT_MPSAFE); - timeout_add(&so->so_sp->ssp_idleto, 0); - } else + if (so->so_sp) { + timeout_del_barrier(&so->so_sp->ssp_idleto); + task_del(sosplice_taskq, &so->so_sp->ssp_task); + taskq_barrier(sosplice_taskq); + pool_put(&sosplice_pool, so->so_sp); + } #endif /* SOCKET_SPLICE */ - { - pool_put(&socket_pool, so); } + + pool_put(&socket_pool, so); } static inline uint64_t @@ -450,39 +445,10 @@ drop: } } discard: - if (so->so_state & SS_NOFDREF) - panic("soclose NOFDREF: so %p, so_type %d", so, so->so_type); - so->so_state |= SS_NOFDREF; - #ifdef SOCKET_SPLICE if (so->so_sp) { struct socket *soback; - if (so->so_proto->pr_flags & PR_WANTRCVD) { - /* - * Copy - Paste, but can't relock and sleep in - * sofree() in tcp(4) case. That's why tcp(4) - * still rely on solock() for splicing and - * unsplicing. - */ - - if (issplicedback(so)) { - int freeing = SOSP_FREEING_WRITE; - - if (so->so_sp->ssp_soback == so) - freeing |= SOSP_FREEING_READ; - sounsplice(so->so_sp->ssp_soback, so, freeing); - } - if (isspliced(so)) { - int freeing = SOSP_FREEING_READ; - - if (so == so->so_sp->ssp_socket) - freeing |= SOSP_FREEING_WRITE; - sounsplice(so, so->so_sp->ssp_socket, freeing); - } - goto free; - } - sounlock(so); mtx_enter(&so->so_snd.sb_mtx); /* @@ -530,8 +496,12 @@ notsplicedback: solock(so); } -free: #endif /* SOCKET_SPLICE */ + + if (so->so_state & SS_NOFDREF) + panic("soclose NOFDREF: so %p, so_type %d", so, so->so_type); + so->so_state |= SS_NOFDREF; + /* sofree() calls sounlock(). */ sofree(so, 0); return (error); @@ -1532,8 +1502,7 @@ sosplice(struct socket *so, int fd, off_ void sounsplice(struct socket *so, struct socket *sosp, int freeing) { - if ((so->so_proto->pr_flags & PR_WANTRCVD) == 0) - sbassertlocked(&so->so_rcv); + sbassertlocked(&so->so_rcv); soassertlocked(so); task_del(sosplice_taskq, &so->so_splicetask); @@ -1587,17 +1556,23 @@ sotask(void *arg) */ sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR); - if (sockstream) - solock(so); - if (so->so_rcv.sb_flags & SB_SPLICE) { - if (sockstream) + struct socket *sosp = so->so_sp->ssp_socket; + + if (sockstream) { + sblock(&sosp->so_snd, SBL_WAIT | SBL_NOINTR); + solock(so); doyield = 1; + } + somove(so, M_DONTWAIT); + + if (sockstream) { + sounlock(so); + sbunlock(&sosp->so_snd); + } } - if (sockstream) - sounlock(so); sbunlock(&so->so_rcv); if (doyield) { @@ -1607,32 +1582,6 @@ sotask(void *arg) } /* - * The socket splicing task or idle timeout may sleep while grabbing the net - * lock. As sofree() can be called anytime, sotask() or soidle() could access - * the socket memory of a freed socket after wakeup. So delay the pool_put() - * after all pending socket splicing tasks or timeouts have finished. Do this - * by scheduling it on the same threads. - */ -void -soreaper(void *arg) -{ - struct socket *so = arg; - - /* Reuse splice task, sounsplice() has been called before. */ - task_set(&so->so_sp->ssp_task, soput, so); - task_add(sosplice_taskq, &so->so_sp->ssp_task); -} - -void -soput(void *arg) -{ - struct socket *so = arg; - - pool_put(&sosplice_pool, so->so_sp); - pool_put(&socket_pool, so); -} - -/* * Move data from receive buffer of spliced source socket to send * buffer of drain socket. Try to move as much as possible in one * big chunk. It is a TCP only implementation. @@ -1652,8 +1601,10 @@ somove(struct socket *so, int wait) if (sockdgram) sbassertlocked(&so->so_rcv); - else + else { + sbassertlocked(&sosp->so_snd); soassertlocked(so); + } mtx_enter(&so->so_rcv.sb_mtx); mtx_enter(&sosp->so_snd.sb_mtx);