From: Vitaliy Makkoveev Subject: Use `sb_mtx' to protect `so_rcv' receive buffer of unix(4) sockets To: tech@openbsd.org Date: Sat, 23 Mar 2024 22:00:35 +0300 This makes re-locking unnecessary in the uipc_*send() paths, because it's enough to lock one socket to prevent peer from concurrent disconnection. As the little bonus unix(4) one socket can perform simultaneous transmission and reception with one exception for uipc_rcvd() which still requires the re-lock for connection oriented sockets. A few words about filt_soread() and filt_soexcept(). The socket lock is not held while they called from uipc_*send() through sorwakeup(). However, the unlocked access to the `so_options', `so_state' and `so_error' is fine. In the uipc_*send() paths the socket lock is still held on sending socket, so the receiving peer can't be disconnected. It also can't be or became listening socket, so SO_ACCEPTCONN is not set and can't be set concurrently as the SS_ISDISCONNECTED bit. The SS_ISCONNECTED bit is always set and can't be cleared concurrently. `so_error' is set on the peer sockets only by unp_detach(), which also can't be called concurrently on sending socket. This is also true for filt_fiforead() and filt_fifoexcept(). For other callers like kevent(2) or doaccept() the socket lock is still held. Index: sys/kern/sys_socket.c =================================================================== RCS file: /cvs/src/sys/kern/sys_socket.c,v retrieving revision 1.61 diff -u -p -r1.61 sys_socket.c --- sys/kern/sys_socket.c 15 Apr 2023 13:18:28 -0000 1.61 +++ sys/kern/sys_socket.c 23 Mar 2024 18:59:11 -0000 @@ -144,9 +144,11 @@ soo_stat(struct file *fp, struct stat *u memset(ub, 0, sizeof (*ub)); ub->st_mode = S_IFSOCK; solock(so); + mtx_enter(&so->so_rcv.sb_mtx); if ((so->so_rcv.sb_state & SS_CANTRCVMORE) == 0 || so->so_rcv.sb_cc != 0) ub->st_mode |= S_IRUSR | S_IRGRP | S_IROTH; + mtx_leave(&so->so_rcv.sb_mtx); if ((so->so_snd.sb_state & SS_CANTSENDMORE) == 0) ub->st_mode |= S_IWUSR | S_IWGRP | S_IWOTH; ub->st_uid = so->so_euid; Index: sys/kern/uipc_socket.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_socket.c,v retrieving revision 1.321 diff -u -p -r1.321 uipc_socket.c --- sys/kern/uipc_socket.c 22 Mar 2024 17:34:11 -0000 1.321 +++ sys/kern/uipc_socket.c 23 Mar 2024 18:59:11 -0000 @@ -160,6 +160,9 @@ soalloc(const struct protosw *prp, int w break; } break; + case AF_UNIX: + so->so_rcv.sb_flags |= SB_MTXLOCK; + break; } return (so); @@ -987,8 +990,11 @@ dontblock: * Dispose of any SCM_RIGHTS message that went * through the read path rather than recv. */ - if (pr->pr_domain->dom_dispose) + if (pr->pr_domain->dom_dispose) { + sb_mtx_unlock(&so->so_rcv); pr->pr_domain->dom_dispose(cm); + sb_mtx_lock(&so->so_rcv); + } m_free(cm); } } @@ -1173,8 +1179,11 @@ dontblock: } SBLASTRECORDCHK(&so->so_rcv, "soreceive 4"); SBLASTMBUFCHK(&so->so_rcv, "soreceive 4"); - if (pr->pr_flags & PR_WANTRCVD) + if (pr->pr_flags & PR_WANTRCVD) { + sb_mtx_unlock(&so->so_rcv); pru_rcvd(so); + sb_mtx_lock(&so->so_rcv); + } } if (orig_resid == uio->uio_resid && orig_resid && (flags & MSG_EOR) == 0 && @@ -1233,10 +1242,10 @@ sorflush(struct socket *so) /* with SBL_WAIT and SLB_NOINTR sblock() must not fail */ KASSERT(error == 0); socantrcvmore(so); + mtx_enter(&sb->sb_mtx); m = sb->sb_mb; memset(&sb->sb_startzero, 0, (caddr_t)&sb->sb_endzero - (caddr_t)&sb->sb_startzero); - mtx_enter(&sb->sb_mtx); sb->sb_timeo_nsecs = INFSLP; mtx_leave(&sb->sb_mtx); sbunlock(so, sb); @@ -1757,7 +1766,8 @@ somove(struct socket *so, int wait) void sorwakeup(struct socket *so) { - soassertlocked_readonly(so); + if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0) + soassertlocked_readonly(so); #ifdef SOCKET_SPLICE if (so->so_rcv.sb_flags & SB_SPLICE) { @@ -1877,6 +1887,8 @@ sosetopt(struct socket *so, int level, i cnt = 1; solock(so); + mtx_enter(&sb->sb_mtx); + switch (optname) { case SO_SNDBUF: case SO_RCVBUF: @@ -1898,7 +1910,10 @@ sosetopt(struct socket *so, int level, i sb->sb_hiwat : cnt; break; } + + mtx_leave(&sb->sb_mtx); sounlock(so); + break; } @@ -2169,13 +2184,6 @@ sofilt_unlock(struct socket *so, struct } } -static inline void -sofilt_assert_locked(struct socket *so, struct sockbuf *sb) -{ - MUTEX_ASSERT_LOCKED(&sb->sb_mtx); - soassertlocked_readonly(so); -} - int soo_kqfilter(struct file *fp, struct knote *kn) { @@ -2218,9 +2226,14 @@ filt_soread(struct knote *kn, long hint) struct socket *so = kn->kn_fp->f_data; int rv = 0; - sofilt_assert_locked(so, &so->so_rcv); + MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx); + if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0) + soassertlocked_readonly(so); if (so->so_options & SO_ACCEPTCONN) { + if (so->so_rcv.sb_flags & SB_MTXLOCK) + soassertlocked_readonly(so); + kn->kn_data = so->so_qlen; rv = (kn->kn_data != 0); @@ -2275,7 +2288,8 @@ filt_sowrite(struct knote *kn, long hint struct socket *so = kn->kn_fp->f_data; int rv; - sofilt_assert_locked(so, &so->so_snd); + MUTEX_ASSERT_LOCKED(&so->so_snd.sb_mtx); + soassertlocked_readonly(so); kn->kn_data = sbspace(so, &so->so_snd); if (so->so_snd.sb_state & SS_CANTSENDMORE) { @@ -2306,7 +2320,9 @@ filt_soexcept(struct knote *kn, long hin struct socket *so = kn->kn_fp->f_data; int rv = 0; - sofilt_assert_locked(so, &so->so_rcv); + MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx); + if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0) + soassertlocked_readonly(so); #ifdef SOCKET_SPLICE if (isspliced(so)) { Index: sys/kern/uipc_socket2.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_socket2.c,v retrieving revision 1.144 diff -u -p -r1.144 uipc_socket2.c --- sys/kern/uipc_socket2.c 12 Feb 2024 22:48:27 -0000 1.144 +++ sys/kern/uipc_socket2.c 23 Mar 2024 18:59:11 -0000 @@ -142,7 +142,9 @@ soisdisconnecting(struct socket *so) soassertlocked(so); so->so_state &= ~SS_ISCONNECTING; so->so_state |= SS_ISDISCONNECTING; + mtx_enter(&so->so_rcv.sb_mtx); so->so_rcv.sb_state |= SS_CANTRCVMORE; + mtx_leave(&so->so_rcv.sb_mtx); so->so_snd.sb_state |= SS_CANTSENDMORE; wakeup(&so->so_timeo); sowwakeup(so); @@ -155,7 +157,9 @@ soisdisconnected(struct socket *so) soassertlocked(so); so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); so->so_state |= SS_ISDISCONNECTED; + mtx_enter(&so->so_rcv.sb_mtx); so->so_rcv.sb_state |= SS_CANTRCVMORE; + mtx_leave(&so->so_rcv.sb_mtx); so->so_snd.sb_state |= SS_CANTSENDMORE; wakeup(&so->so_timeo); sowwakeup(so); @@ -219,9 +223,10 @@ sonewconn(struct socket *head, int conns mtx_enter(&head->so_snd.sb_mtx); so->so_snd.sb_timeo_nsecs = head->so_snd.sb_timeo_nsecs; mtx_leave(&head->so_snd.sb_mtx); + + mtx_enter(&head->so_rcv.sb_mtx); so->so_rcv.sb_wat = head->so_rcv.sb_wat; so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; - mtx_enter(&head->so_rcv.sb_mtx); so->so_rcv.sb_timeo_nsecs = head->so_rcv.sb_timeo_nsecs; mtx_leave(&head->so_rcv.sb_mtx); @@ -651,16 +656,22 @@ soreserve(struct socket *so, u_long sndc if (sbreserve(so, &so->so_snd, sndcc)) goto bad; - if (sbreserve(so, &so->so_rcv, rcvcc)) - goto bad2; so->so_snd.sb_wat = sndcc; - so->so_rcv.sb_wat = rcvcc; - if (so->so_rcv.sb_lowat == 0) - so->so_rcv.sb_lowat = 1; if (so->so_snd.sb_lowat == 0) so->so_snd.sb_lowat = MCLBYTES; if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat) so->so_snd.sb_lowat = so->so_snd.sb_hiwat; + + mtx_enter(&so->so_rcv.sb_mtx); + if (sbreserve(so, &so->so_rcv, rcvcc)) { + mtx_leave(&so->so_rcv.sb_mtx); + goto bad2; + } + so->so_rcv.sb_wat = rcvcc; + if (so->so_rcv.sb_lowat == 0) + so->so_rcv.sb_lowat = 1; + mtx_leave(&so->so_rcv.sb_mtx); + return (0); bad2: sbrelease(so, &so->so_snd); @@ -676,8 +687,7 @@ bad: int sbreserve(struct socket *so, struct sockbuf *sb, u_long cc) { - KASSERT(sb == &so->so_rcv || sb == &so->so_snd); - soassertlocked(so); + sbmtxassertlocked(so, sb); if (cc == 0 || cc > sb_max) return (1); @@ -818,7 +828,7 @@ sbappend(struct socket *so, struct sockb if (m == NULL) return; - soassertlocked(so); + sbmtxassertlocked(so, sb); SBLASTRECORDCHK(sb, "sbappend 1"); if ((n = sb->sb_lastrecord) != NULL) { @@ -899,8 +909,7 @@ sbappendrecord(struct socket *so, struct { struct mbuf *m; - KASSERT(sb == &so->so_rcv || sb == &so->so_snd); - soassertlocked(so); + sbmtxassertlocked(so, sb); if (m0 == NULL) return; @@ -984,6 +993,8 @@ sbappendcontrol(struct socket *so, struc struct mbuf *m, *mlast, *n; int eor = 0, space = 0; + sbmtxassertlocked(so, sb); + if (control == NULL) panic("sbappendcontrol"); for (m = control; ; m = m->m_next) { @@ -1109,8 +1120,7 @@ sbdrop(struct socket *so, struct sockbuf struct mbuf *m, *mn; struct mbuf *next; - KASSERT(sb == &so->so_rcv || sb == &so->so_snd); - soassertlocked(so); + sbmtxassertlocked(so, sb); next = (m = sb->sb_mb) ? m->m_nextpkt : NULL; while (len > 0) { Index: sys/kern/uipc_usrreq.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v retrieving revision 1.202 diff -u -p -r1.202 uipc_usrreq.c --- sys/kern/uipc_usrreq.c 22 Mar 2024 17:34:11 -0000 1.202 +++ sys/kern/uipc_usrreq.c 23 Mar 2024 18:59:11 -0000 @@ -489,8 +489,10 @@ uipc_rcvd(struct socket *so) * Adjust backpressure on sender * and wakeup any waiting to write. */ + mtx_enter(&so->so_rcv.sb_mtx); so2->so_snd.sb_mbcnt = so->so_rcv.sb_mbcnt; so2->so_snd.sb_cc = so->so_rcv.sb_cc; + mtx_leave(&so->so_rcv.sb_mtx); sowwakeup(so2); sounlock(so2); } @@ -499,8 +501,9 @@ int uipc_send(struct socket *so, struct mbuf *m, struct mbuf *nam, struct mbuf *control) { + struct unpcb *unp = sotounpcb(so); struct socket *so2; - int error = 0; + int error = 0, dowakeup = 0; if (control) { sounlock(so); @@ -514,21 +517,24 @@ uipc_send(struct socket *so, struct mbuf error = EPIPE; goto dispose; } - if ((so2 = unp_solock_peer(so)) == NULL) { + if (unp->unp_conn == NULL) { error = ENOTCONN; goto dispose; } + so2 = unp->unp_conn->unp_socket; + /* * Send to paired receive port, and then raise * send buffer counts to maintain backpressure. * Wake up readers. */ + mtx_enter(&so2->so_rcv.sb_mtx); if (control) { if (sbappendcontrol(so2, &so2->so_rcv, m, control)) { control = NULL; } else { - sounlock(so2); + mtx_leave(&so2->so_rcv.sb_mtx); error = ENOBUFS; goto dispose; } @@ -539,9 +545,12 @@ uipc_send(struct socket *so, struct mbuf so->so_snd.sb_mbcnt = so2->so_rcv.sb_mbcnt; so->so_snd.sb_cc = so2->so_rcv.sb_cc; if (so2->so_rcv.sb_cc > 0) + dowakeup = 1; + mtx_leave(&so2->so_rcv.sb_mtx); + + if (dowakeup) sorwakeup(so2); - sounlock(so2); m = NULL; dispose: @@ -563,7 +572,7 @@ uipc_dgram_send(struct socket *so, struc struct unpcb *unp = sotounpcb(so); struct socket *so2; const struct sockaddr *from; - int error = 0; + int error = 0, dowakeup = 0; if (control) { sounlock(so); @@ -583,7 +592,7 @@ uipc_dgram_send(struct socket *so, struc goto dispose; } - if ((so2 = unp_solock_peer(so)) == NULL) { + if (unp->unp_conn == NULL) { if (nam != NULL) error = ECONNREFUSED; else @@ -591,20 +600,24 @@ uipc_dgram_send(struct socket *so, struc goto dispose; } + so2 = unp->unp_conn->unp_socket; + if (unp->unp_addr) from = mtod(unp->unp_addr, struct sockaddr *); else from = &sun_noname; + + mtx_enter(&so2->so_rcv.sb_mtx); if (sbappendaddr(so2, &so2->so_rcv, from, m, control)) { - sorwakeup(so2); + dowakeup = 1; m = NULL; control = NULL; } else error = ENOBUFS; + mtx_leave(&so2->so_rcv.sb_mtx); - if (so2 != so) - sounlock(so2); - + if (dowakeup) + sorwakeup(so2); if (nam) unp_disconnect(unp); @@ -1390,9 +1403,9 @@ unp_gc(void *arg __unused) if ((unp->unp_gcflags & UNP_GCDEAD) == 0) continue; so = unp->unp_socket; - solock(so); + mtx_enter(&so->so_rcv.sb_mtx); unp_scan(so->so_rcv.sb_mb, unp_remove_gcrefs); - sounlock(so); + mtx_leave(&so->so_rcv.sb_mtx); } /* @@ -1414,9 +1427,9 @@ unp_gc(void *arg __unused) unp->unp_gcflags &= ~UNP_GCDEAD; so = unp->unp_socket; - solock(so); + mtx_enter(&so->so_rcv.sb_mtx); unp_scan(so->so_rcv.sb_mb, unp_restore_gcrefs); - sounlock(so); + mtx_leave(&so->so_rcv.sb_mtx); KASSERT(nunref > 0); nunref--; Index: sys/miscfs/fifofs/fifo_vnops.c =================================================================== RCS file: /cvs/src/sys/miscfs/fifofs/fifo_vnops.c,v retrieving revision 1.103 diff -u -p -r1.103 fifo_vnops.c --- sys/miscfs/fifofs/fifo_vnops.c 3 Feb 2024 22:50:09 -0000 1.103 +++ sys/miscfs/fifofs/fifo_vnops.c 23 Mar 2024 18:59:11 -0000 @@ -201,7 +201,9 @@ fifo_open(void *v) if (fip->fi_writers == 1) { solock(rso); rso->so_state &= ~SS_ISDISCONNECTED; + mtx_enter(&rso->so_rcv.sb_mtx); rso->so_rcv.sb_state &= ~SS_CANTRCVMORE; + mtx_leave(&rso->so_rcv.sb_mtx); sounlock(rso); if (fip->fi_readers > 0) wakeup(&fip->fi_readers); @@ -518,7 +520,6 @@ filt_fiforead(struct knote *kn, long hin struct socket *so = kn->kn_hook; int rv; - soassertlocked(so); MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx); kn->kn_data = so->so_rcv.sb_cc; @@ -574,7 +575,6 @@ filt_fifoexcept(struct knote *kn, long h struct socket *so = kn->kn_hook; int rv = 0; - soassertlocked(so); MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx); if (kn->kn_flags & __EV_POLL) { Index: sys/sys/socketvar.h =================================================================== RCS file: /cvs/src/sys/sys/socketvar.h,v retrieving revision 1.125 diff -u -p -r1.125 socketvar.h --- sys/sys/socketvar.h 22 Mar 2024 17:34:11 -0000 1.125 +++ sys/sys/socketvar.h 23 Mar 2024 18:59:11 -0000 @@ -242,7 +242,10 @@ sb_notify(struct socket *so, struct sock static inline long sbspace(struct socket *so, struct sockbuf *sb) { - soassertlocked_readonly(so); + if (sb->sb_flags & SB_MTXLOCK) + sbmtxassertlocked(so, sb); + else + soassertlocked_readonly(so); return lmin(sb->sb_hiwat - sb->sb_cc, sb->sb_mbmax - sb->sb_mbcnt); }