From: Vitaliy Makkoveev Subject: Don't take solock() in sosend() for SOCK_RAW sockets To: tech@openbsd.org Date: Fri, 19 Apr 2024 22:39:58 +0300 Mostly inspired by udp(4) sockets. Parallel udp(4) input blocked by somove() which requires to protect not only `so_rcv', but also `so_snd' of spliced peer. However, for SOCK_DGRAM sockets, `sb_mtx' looks pretty enough to serialize somove() and sosend(). Raw sockets are the simplest inet sockets, so I want to use them to start landing `sb_mtx' to protect `so_snd' buffer. Now solock() is taken only around pru_send*(), the rest of sosend() serialized by sblock() and `sb_mtx' mutex(9). The unlocked SS_ISCONNECTED checks looks fine, because rip{,6}_send() check it. Also, the SS_ISCONNECTED could be lost due to solock() release around following m_getuio(). The profit of unlocked sosend() could be not obvious, but let's assume it as pushing netlock deeper to make some checks and sleep arond `so_snd' lockless. Also, for unix(4) lockless sosend() removes re-locking from uipc_rcvd() in soreceive() path. I have the same diff for udp(4) sockets, but I want to do somove() refactoring separately. Index: sys/kern/sys_socket.c =================================================================== RCS file: /cvs/src/sys/kern/sys_socket.c,v retrieving revision 1.64 diff -u -p -r1.64 sys_socket.c --- sys/kern/sys_socket.c 11 Apr 2024 08:33:37 -0000 1.64 +++ sys/kern/sys_socket.c 19 Apr 2024 19:19:44 -0000 @@ -92,6 +92,7 @@ soo_ioctl(struct file *fp, u_long cmd, c case FIOASYNC: solock(so); mtx_enter(&so->so_rcv.sb_mtx); + mtx_enter(&so->so_snd.sb_mtx); if (*(int *)data) { so->so_rcv.sb_flags |= SB_ASYNC; so->so_snd.sb_flags |= SB_ASYNC; @@ -99,6 +100,7 @@ soo_ioctl(struct file *fp, u_long cmd, c so->so_rcv.sb_flags &= ~SB_ASYNC; so->so_snd.sb_flags &= ~SB_ASYNC; } + mtx_leave(&so->so_snd.sb_mtx); mtx_leave(&so->so_rcv.sb_mtx); sounlock(so); break; Index: sys/kern/uipc_socket.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_socket.c,v retrieving revision 1.330 diff -u -p -r1.330 uipc_socket.c --- sys/kern/uipc_socket.c 15 Apr 2024 21:31:29 -0000 1.330 +++ sys/kern/uipc_socket.c 19 Apr 2024 19:19:44 -0000 @@ -146,8 +146,8 @@ soalloc(const struct protosw *prp, int w refcnt_init(&so->so_refcnt); rw_init(&so->so_rcv.sb_lock, "sbufrcv"); rw_init(&so->so_snd.sb_lock, "sbufsnd"); - mtx_init(&so->so_rcv.sb_mtx, IPL_MPFLOOR); - mtx_init(&so->so_snd.sb_mtx, IPL_MPFLOOR); + mtx_init_flags(&so->so_rcv.sb_mtx, IPL_MPFLOOR, "sbrcv", 0); + mtx_init_flags(&so->so_snd.sb_mtx, IPL_MPFLOOR, "sbsnd", 0); klist_init_mutex(&so->so_rcv.sb_klist, &so->so_rcv.sb_mtx); klist_init_mutex(&so->so_snd.sb_klist, &so->so_snd.sb_mtx); sigio_init(&so->so_sigio); @@ -158,8 +158,10 @@ soalloc(const struct protosw *prp, int w case AF_INET: case AF_INET6: switch (prp->pr_type) { - case SOCK_DGRAM: case SOCK_RAW: + so->so_snd.sb_flags |= SB_MTXLOCK | SB_OWNLOCK; + /* FALLTHROUGH */ + case SOCK_DGRAM: so->so_rcv.sb_flags |= SB_MTXLOCK | SB_OWNLOCK; break; } @@ -346,7 +348,10 @@ sofree(struct socket *so, int keep_lock) sounsplice(so, so->so_sp->ssp_socket, freeing); } #endif /* SOCKET_SPLICE */ + + mtx_enter(&so->so_snd.sb_mtx); sbrelease(so, &so->so_snd); + mtx_leave(&so->so_snd.sb_mtx); /* * Regardless on '_locked' postfix, must release solock() before @@ -569,6 +574,7 @@ sosend(struct socket *so, struct mbuf *a size_t resid; int error; int atomic = sosendallatonce(so) || top; + int dosolock = ((so->so_snd.sb_flags & SB_OWNLOCK) == 0); if (uio) resid = uio->uio_resid; @@ -601,16 +607,17 @@ sosend(struct socket *so, struct mbuf *a #define snderr(errno) { error = errno; goto release; } - solock_shared(so); + if (dosolock) + solock_shared(so); restart: if ((error = sblock(so, &so->so_snd, SBLOCKWAIT(flags))) != 0) goto out; + sb_mtx_lock(&so->so_snd); so->so_snd.sb_state |= SS_ISSENDING; do { if (so->so_snd.sb_state & SS_CANTSENDMORE) snderr(EPIPE); - if (so->so_error) { - error = so->so_error; + if ((error = READ_ONCE(so->so_error))) { so->so_error = 0; snderr(error); } @@ -638,8 +645,14 @@ restart: if (flags & MSG_DONTWAIT) snderr(EWOULDBLOCK); sbunlock(so, &so->so_snd); - error = sbwait(so, &so->so_snd); + + if (so->so_snd.sb_flags & SB_OWNLOCK) + error = sbwait_locked(so, &so->so_snd); + else + error = sbwait(so, &so->so_snd); + so->so_snd.sb_state &= ~SS_ISSENDING; + sb_mtx_unlock(&so->so_snd); if (error) goto out; goto restart; @@ -654,9 +667,11 @@ restart: if (flags & MSG_EOR) top->m_flags |= M_EOR; } else { - sounlock_shared(so); + if (dosolock) + sounlock_shared(so); error = m_getuio(&top, atomic, space, uio); - solock_shared(so); + if (dosolock) + solock_shared(so); if (error) goto release; space -= top->m_pkthdr.len; @@ -668,10 +683,18 @@ restart: so->so_snd.sb_state &= ~SS_ISSENDING; if (top && so->so_options & SO_ZEROIZE) top->m_flags |= M_ZEROIZE; + if (!dosolock) { + mtx_leave(&so->so_snd.sb_mtx); + solock_shared(so); + } if (flags & MSG_OOB) error = pru_sendoob(so, top, addr, control); else error = pru_send(so, top, addr, control); + if (!dosolock) { + sounlock_shared(so); + mtx_enter(&so->so_snd.sb_mtx); + } clen = 0; control = NULL; top = NULL; @@ -682,9 +705,11 @@ restart: release: so->so_snd.sb_state &= ~SS_ISSENDING; + sb_mtx_unlock(&so->so_snd); sbunlock(so, &so->so_snd); out: - sounlock_shared(so); + if (dosolock) + sounlock_shared(so); m_freem(top); m_freem(control); return (error); Index: sys/kern/uipc_socket2.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_socket2.c,v retrieving revision 1.149 diff -u -p -r1.149 uipc_socket2.c --- sys/kern/uipc_socket2.c 11 Apr 2024 13:32:51 -0000 1.149 +++ sys/kern/uipc_socket2.c 19 Apr 2024 19:19:44 -0000 @@ -142,10 +142,15 @@ soisdisconnecting(struct socket *so) soassertlocked(so); so->so_state &= ~SS_ISCONNECTING; so->so_state |= SS_ISDISCONNECTING; + mtx_enter(&so->so_rcv.sb_mtx); so->so_rcv.sb_state |= SS_CANTRCVMORE; mtx_leave(&so->so_rcv.sb_mtx); + + mtx_enter(&so->so_snd.sb_mtx); so->so_snd.sb_state |= SS_CANTSENDMORE; + mtx_leave(&so->so_snd.sb_mtx); + wakeup(&so->so_timeo); sowwakeup(so); sorwakeup(so); @@ -157,10 +162,15 @@ soisdisconnected(struct socket *so) soassertlocked(so); so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); so->so_state |= SS_ISDISCONNECTED; + mtx_enter(&so->so_rcv.sb_mtx); so->so_rcv.sb_state |= SS_CANTRCVMORE; mtx_leave(&so->so_rcv.sb_mtx); + + mtx_enter(&so->so_snd.sb_mtx); so->so_snd.sb_state |= SS_CANTSENDMORE; + mtx_leave(&so->so_snd.sb_mtx); + wakeup(&so->so_timeo); sowwakeup(so); sorwakeup(so); @@ -315,7 +325,9 @@ void socantsendmore(struct socket *so) { soassertlocked(so); + mtx_enter(&so->so_snd.sb_mtx); so->so_snd.sb_state |= SS_CANTSENDMORE; + mtx_leave(&so->so_snd.sb_mtx); sowwakeup(so); } @@ -666,6 +678,8 @@ soreserve(struct socket *so, u_long sndc { soassertlocked(so); + mtx_enter(&so->so_rcv.sb_mtx); + mtx_enter(&so->so_snd.sb_mtx); if (sbreserve(so, &so->so_snd, sndcc)) goto bad; so->so_snd.sb_wat = sndcc; @@ -673,8 +687,6 @@ soreserve(struct socket *so, u_long sndc so->so_snd.sb_lowat = MCLBYTES; if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat) so->so_snd.sb_lowat = so->so_snd.sb_hiwat; - - mtx_enter(&so->so_rcv.sb_mtx); if (sbreserve(so, &so->so_rcv, rcvcc)) { mtx_leave(&so->so_rcv.sb_mtx); goto bad2; @@ -682,12 +694,15 @@ soreserve(struct socket *so, u_long sndc so->so_rcv.sb_wat = rcvcc; if (so->so_rcv.sb_lowat == 0) so->so_rcv.sb_lowat = 1; + mtx_leave(&so->so_snd.sb_mtx); mtx_leave(&so->so_rcv.sb_mtx); return (0); bad2: sbrelease(so, &so->so_snd); bad: + mtx_leave(&so->so_snd.sb_mtx); + mtx_leave(&so->so_rcv.sb_mtx); return (ENOBUFS); }