From: Alexander Bluhm Subject: Re: Use `sb_mtx' to protect `so_rcv' receive buffer of unix(4) sockets To: Vitaliy Makkoveev Cc: tech@openbsd.org Date: Tue, 26 Mar 2024 00:40:00 +0100 On Sat, Mar 23, 2024 at 10:00:35PM +0300, Vitaliy Makkoveev wrote: > This makes re-locking unnecessary in the uipc_*send() paths, because > it's enough to lock one socket to prevent peer from concurrent > disconnection. As the little bonus unix(4) one socket can perform > simultaneous transmission and reception with one exception for > uipc_rcvd() which still requires the re-lock for connection oriented > sockets. > > A few words about filt_soread() and filt_soexcept(). The socket lock is > not held while they called from uipc_*send() through sorwakeup(). > However, the unlocked access to the `so_options', `so_state' and > `so_error' is fine. > > In the uipc_*send() paths the socket lock is still held on sending > socket, so the receiving peer can't be disconnected. It also can't be or > became listening socket, so SO_ACCEPTCONN is not set and can't be set > concurrently as the SS_ISDISCONNECTED bit. The SS_ISCONNECTED bit is > always set and can't be cleared concurrently. `so_error' is set on the > peer sockets only by unp_detach(), which also can't be called > concurrently on sending socket. > > This is also true for filt_fiforead() and filt_fifoexcept(). For other > callers like kevent(2) or doaccept() the socket lock is still held. Passed regress with witness on i386. OK bluhm@ > Index: sys/kern/sys_socket.c > =================================================================== > RCS file: /cvs/src/sys/kern/sys_socket.c,v > retrieving revision 1.61 > diff -u -p -r1.61 sys_socket.c > --- sys/kern/sys_socket.c 15 Apr 2023 13:18:28 -0000 1.61 > +++ sys/kern/sys_socket.c 23 Mar 2024 18:59:11 -0000 > @@ -144,9 +144,11 @@ soo_stat(struct file *fp, struct stat *u > memset(ub, 0, sizeof (*ub)); > ub->st_mode = S_IFSOCK; > solock(so); > + mtx_enter(&so->so_rcv.sb_mtx); > if ((so->so_rcv.sb_state & SS_CANTRCVMORE) == 0 || > so->so_rcv.sb_cc != 0) > ub->st_mode |= S_IRUSR | S_IRGRP | S_IROTH; > + mtx_leave(&so->so_rcv.sb_mtx); > if ((so->so_snd.sb_state & SS_CANTSENDMORE) == 0) > ub->st_mode |= S_IWUSR | S_IWGRP | S_IWOTH; > ub->st_uid = so->so_euid; > Index: sys/kern/uipc_socket.c > =================================================================== > RCS file: /cvs/src/sys/kern/uipc_socket.c,v > retrieving revision 1.321 > diff -u -p -r1.321 uipc_socket.c > --- sys/kern/uipc_socket.c 22 Mar 2024 17:34:11 -0000 1.321 > +++ sys/kern/uipc_socket.c 23 Mar 2024 18:59:11 -0000 > @@ -160,6 +160,9 @@ soalloc(const struct protosw *prp, int w > break; > } > break; > + case AF_UNIX: > + so->so_rcv.sb_flags |= SB_MTXLOCK; > + break; > } > > return (so); > @@ -987,8 +990,11 @@ dontblock: > * Dispose of any SCM_RIGHTS message that went > * through the read path rather than recv. > */ > - if (pr->pr_domain->dom_dispose) > + if (pr->pr_domain->dom_dispose) { > + sb_mtx_unlock(&so->so_rcv); > pr->pr_domain->dom_dispose(cm); > + sb_mtx_lock(&so->so_rcv); > + } > m_free(cm); > } > } > @@ -1173,8 +1179,11 @@ dontblock: > } > SBLASTRECORDCHK(&so->so_rcv, "soreceive 4"); > SBLASTMBUFCHK(&so->so_rcv, "soreceive 4"); > - if (pr->pr_flags & PR_WANTRCVD) > + if (pr->pr_flags & PR_WANTRCVD) { > + sb_mtx_unlock(&so->so_rcv); > pru_rcvd(so); > + sb_mtx_lock(&so->so_rcv); > + } > } > if (orig_resid == uio->uio_resid && orig_resid && > (flags & MSG_EOR) == 0 && > @@ -1233,10 +1242,10 @@ sorflush(struct socket *so) > /* with SBL_WAIT and SLB_NOINTR sblock() must not fail */ > KASSERT(error == 0); > socantrcvmore(so); > + mtx_enter(&sb->sb_mtx); > m = sb->sb_mb; > memset(&sb->sb_startzero, 0, > (caddr_t)&sb->sb_endzero - (caddr_t)&sb->sb_startzero); > - mtx_enter(&sb->sb_mtx); > sb->sb_timeo_nsecs = INFSLP; > mtx_leave(&sb->sb_mtx); > sbunlock(so, sb); > @@ -1757,7 +1766,8 @@ somove(struct socket *so, int wait) > void > sorwakeup(struct socket *so) > { > - soassertlocked_readonly(so); > + if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0) > + soassertlocked_readonly(so); > > #ifdef SOCKET_SPLICE > if (so->so_rcv.sb_flags & SB_SPLICE) { > @@ -1877,6 +1887,8 @@ sosetopt(struct socket *so, int level, i > cnt = 1; > > solock(so); > + mtx_enter(&sb->sb_mtx); > + > switch (optname) { > case SO_SNDBUF: > case SO_RCVBUF: > @@ -1898,7 +1910,10 @@ sosetopt(struct socket *so, int level, i > sb->sb_hiwat : cnt; > break; > } > + > + mtx_leave(&sb->sb_mtx); > sounlock(so); > + > break; > } > > @@ -2169,13 +2184,6 @@ sofilt_unlock(struct socket *so, struct > } > } > > -static inline void > -sofilt_assert_locked(struct socket *so, struct sockbuf *sb) > -{ > - MUTEX_ASSERT_LOCKED(&sb->sb_mtx); > - soassertlocked_readonly(so); > -} > - > int > soo_kqfilter(struct file *fp, struct knote *kn) > { > @@ -2218,9 +2226,14 @@ filt_soread(struct knote *kn, long hint) > struct socket *so = kn->kn_fp->f_data; > int rv = 0; > > - sofilt_assert_locked(so, &so->so_rcv); > + MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx); > + if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0) > + soassertlocked_readonly(so); > > if (so->so_options & SO_ACCEPTCONN) { > + if (so->so_rcv.sb_flags & SB_MTXLOCK) > + soassertlocked_readonly(so); > + > kn->kn_data = so->so_qlen; > rv = (kn->kn_data != 0); > > @@ -2275,7 +2288,8 @@ filt_sowrite(struct knote *kn, long hint > struct socket *so = kn->kn_fp->f_data; > int rv; > > - sofilt_assert_locked(so, &so->so_snd); > + MUTEX_ASSERT_LOCKED(&so->so_snd.sb_mtx); > + soassertlocked_readonly(so); > > kn->kn_data = sbspace(so, &so->so_snd); > if (so->so_snd.sb_state & SS_CANTSENDMORE) { > @@ -2306,7 +2320,9 @@ filt_soexcept(struct knote *kn, long hin > struct socket *so = kn->kn_fp->f_data; > int rv = 0; > > - sofilt_assert_locked(so, &so->so_rcv); > + MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx); > + if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0) > + soassertlocked_readonly(so); > > #ifdef SOCKET_SPLICE > if (isspliced(so)) { > Index: sys/kern/uipc_socket2.c > =================================================================== > RCS file: /cvs/src/sys/kern/uipc_socket2.c,v > retrieving revision 1.144 > diff -u -p -r1.144 uipc_socket2.c > --- sys/kern/uipc_socket2.c 12 Feb 2024 22:48:27 -0000 1.144 > +++ sys/kern/uipc_socket2.c 23 Mar 2024 18:59:11 -0000 > @@ -142,7 +142,9 @@ soisdisconnecting(struct socket *so) > soassertlocked(so); > so->so_state &= ~SS_ISCONNECTING; > so->so_state |= SS_ISDISCONNECTING; > + mtx_enter(&so->so_rcv.sb_mtx); > so->so_rcv.sb_state |= SS_CANTRCVMORE; > + mtx_leave(&so->so_rcv.sb_mtx); > so->so_snd.sb_state |= SS_CANTSENDMORE; > wakeup(&so->so_timeo); > sowwakeup(so); > @@ -155,7 +157,9 @@ soisdisconnected(struct socket *so) > soassertlocked(so); > so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); > so->so_state |= SS_ISDISCONNECTED; > + mtx_enter(&so->so_rcv.sb_mtx); > so->so_rcv.sb_state |= SS_CANTRCVMORE; > + mtx_leave(&so->so_rcv.sb_mtx); > so->so_snd.sb_state |= SS_CANTSENDMORE; > wakeup(&so->so_timeo); > sowwakeup(so); > @@ -219,9 +223,10 @@ sonewconn(struct socket *head, int conns > mtx_enter(&head->so_snd.sb_mtx); > so->so_snd.sb_timeo_nsecs = head->so_snd.sb_timeo_nsecs; > mtx_leave(&head->so_snd.sb_mtx); > + > + mtx_enter(&head->so_rcv.sb_mtx); > so->so_rcv.sb_wat = head->so_rcv.sb_wat; > so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; > - mtx_enter(&head->so_rcv.sb_mtx); > so->so_rcv.sb_timeo_nsecs = head->so_rcv.sb_timeo_nsecs; > mtx_leave(&head->so_rcv.sb_mtx); > > @@ -651,16 +656,22 @@ soreserve(struct socket *so, u_long sndc > > if (sbreserve(so, &so->so_snd, sndcc)) > goto bad; > - if (sbreserve(so, &so->so_rcv, rcvcc)) > - goto bad2; > so->so_snd.sb_wat = sndcc; > - so->so_rcv.sb_wat = rcvcc; > - if (so->so_rcv.sb_lowat == 0) > - so->so_rcv.sb_lowat = 1; > if (so->so_snd.sb_lowat == 0) > so->so_snd.sb_lowat = MCLBYTES; > if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat) > so->so_snd.sb_lowat = so->so_snd.sb_hiwat; > + > + mtx_enter(&so->so_rcv.sb_mtx); > + if (sbreserve(so, &so->so_rcv, rcvcc)) { > + mtx_leave(&so->so_rcv.sb_mtx); > + goto bad2; > + } > + so->so_rcv.sb_wat = rcvcc; > + if (so->so_rcv.sb_lowat == 0) > + so->so_rcv.sb_lowat = 1; > + mtx_leave(&so->so_rcv.sb_mtx); > + > return (0); > bad2: > sbrelease(so, &so->so_snd); > @@ -676,8 +687,7 @@ bad: > int > sbreserve(struct socket *so, struct sockbuf *sb, u_long cc) > { > - KASSERT(sb == &so->so_rcv || sb == &so->so_snd); > - soassertlocked(so); > + sbmtxassertlocked(so, sb); > > if (cc == 0 || cc > sb_max) > return (1); > @@ -818,7 +828,7 @@ sbappend(struct socket *so, struct sockb > if (m == NULL) > return; > > - soassertlocked(so); > + sbmtxassertlocked(so, sb); > SBLASTRECORDCHK(sb, "sbappend 1"); > > if ((n = sb->sb_lastrecord) != NULL) { > @@ -899,8 +909,7 @@ sbappendrecord(struct socket *so, struct > { > struct mbuf *m; > > - KASSERT(sb == &so->so_rcv || sb == &so->so_snd); > - soassertlocked(so); > + sbmtxassertlocked(so, sb); > > if (m0 == NULL) > return; > @@ -984,6 +993,8 @@ sbappendcontrol(struct socket *so, struc > struct mbuf *m, *mlast, *n; > int eor = 0, space = 0; > > + sbmtxassertlocked(so, sb); > + > if (control == NULL) > panic("sbappendcontrol"); > for (m = control; ; m = m->m_next) { > @@ -1109,8 +1120,7 @@ sbdrop(struct socket *so, struct sockbuf > struct mbuf *m, *mn; > struct mbuf *next; > > - KASSERT(sb == &so->so_rcv || sb == &so->so_snd); > - soassertlocked(so); > + sbmtxassertlocked(so, sb); > > next = (m = sb->sb_mb) ? m->m_nextpkt : NULL; > while (len > 0) { > Index: sys/kern/uipc_usrreq.c > =================================================================== > RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v > retrieving revision 1.202 > diff -u -p -r1.202 uipc_usrreq.c > --- sys/kern/uipc_usrreq.c 22 Mar 2024 17:34:11 -0000 1.202 > +++ sys/kern/uipc_usrreq.c 23 Mar 2024 18:59:11 -0000 > @@ -489,8 +489,10 @@ uipc_rcvd(struct socket *so) > * Adjust backpressure on sender > * and wakeup any waiting to write. > */ > + mtx_enter(&so->so_rcv.sb_mtx); > so2->so_snd.sb_mbcnt = so->so_rcv.sb_mbcnt; > so2->so_snd.sb_cc = so->so_rcv.sb_cc; > + mtx_leave(&so->so_rcv.sb_mtx); > sowwakeup(so2); > sounlock(so2); > } > @@ -499,8 +501,9 @@ int > uipc_send(struct socket *so, struct mbuf *m, struct mbuf *nam, > struct mbuf *control) > { > + struct unpcb *unp = sotounpcb(so); > struct socket *so2; > - int error = 0; > + int error = 0, dowakeup = 0; > > if (control) { > sounlock(so); > @@ -514,21 +517,24 @@ uipc_send(struct socket *so, struct mbuf > error = EPIPE; > goto dispose; > } > - if ((so2 = unp_solock_peer(so)) == NULL) { > + if (unp->unp_conn == NULL) { > error = ENOTCONN; > goto dispose; > } > > + so2 = unp->unp_conn->unp_socket; > + > /* > * Send to paired receive port, and then raise > * send buffer counts to maintain backpressure. > * Wake up readers. > */ > + mtx_enter(&so2->so_rcv.sb_mtx); > if (control) { > if (sbappendcontrol(so2, &so2->so_rcv, m, control)) { > control = NULL; > } else { > - sounlock(so2); > + mtx_leave(&so2->so_rcv.sb_mtx); > error = ENOBUFS; > goto dispose; > } > @@ -539,9 +545,12 @@ uipc_send(struct socket *so, struct mbuf > so->so_snd.sb_mbcnt = so2->so_rcv.sb_mbcnt; > so->so_snd.sb_cc = so2->so_rcv.sb_cc; > if (so2->so_rcv.sb_cc > 0) > + dowakeup = 1; > + mtx_leave(&so2->so_rcv.sb_mtx); > + > + if (dowakeup) > sorwakeup(so2); > > - sounlock(so2); > m = NULL; > > dispose: > @@ -563,7 +572,7 @@ uipc_dgram_send(struct socket *so, struc > struct unpcb *unp = sotounpcb(so); > struct socket *so2; > const struct sockaddr *from; > - int error = 0; > + int error = 0, dowakeup = 0; > > if (control) { > sounlock(so); > @@ -583,7 +592,7 @@ uipc_dgram_send(struct socket *so, struc > goto dispose; > } > > - if ((so2 = unp_solock_peer(so)) == NULL) { > + if (unp->unp_conn == NULL) { > if (nam != NULL) > error = ECONNREFUSED; > else > @@ -591,20 +600,24 @@ uipc_dgram_send(struct socket *so, struc > goto dispose; > } > > + so2 = unp->unp_conn->unp_socket; > + > if (unp->unp_addr) > from = mtod(unp->unp_addr, struct sockaddr *); > else > from = &sun_noname; > + > + mtx_enter(&so2->so_rcv.sb_mtx); > if (sbappendaddr(so2, &so2->so_rcv, from, m, control)) { > - sorwakeup(so2); > + dowakeup = 1; > m = NULL; > control = NULL; > } else > error = ENOBUFS; > + mtx_leave(&so2->so_rcv.sb_mtx); > > - if (so2 != so) > - sounlock(so2); > - > + if (dowakeup) > + sorwakeup(so2); > if (nam) > unp_disconnect(unp); > > @@ -1390,9 +1403,9 @@ unp_gc(void *arg __unused) > if ((unp->unp_gcflags & UNP_GCDEAD) == 0) > continue; > so = unp->unp_socket; > - solock(so); > + mtx_enter(&so->so_rcv.sb_mtx); > unp_scan(so->so_rcv.sb_mb, unp_remove_gcrefs); > - sounlock(so); > + mtx_leave(&so->so_rcv.sb_mtx); > } > > /* > @@ -1414,9 +1427,9 @@ unp_gc(void *arg __unused) > unp->unp_gcflags &= ~UNP_GCDEAD; > > so = unp->unp_socket; > - solock(so); > + mtx_enter(&so->so_rcv.sb_mtx); > unp_scan(so->so_rcv.sb_mb, unp_restore_gcrefs); > - sounlock(so); > + mtx_leave(&so->so_rcv.sb_mtx); > > KASSERT(nunref > 0); > nunref--; > Index: sys/miscfs/fifofs/fifo_vnops.c > =================================================================== > RCS file: /cvs/src/sys/miscfs/fifofs/fifo_vnops.c,v > retrieving revision 1.103 > diff -u -p -r1.103 fifo_vnops.c > --- sys/miscfs/fifofs/fifo_vnops.c 3 Feb 2024 22:50:09 -0000 1.103 > +++ sys/miscfs/fifofs/fifo_vnops.c 23 Mar 2024 18:59:11 -0000 > @@ -201,7 +201,9 @@ fifo_open(void *v) > if (fip->fi_writers == 1) { > solock(rso); > rso->so_state &= ~SS_ISDISCONNECTED; > + mtx_enter(&rso->so_rcv.sb_mtx); > rso->so_rcv.sb_state &= ~SS_CANTRCVMORE; > + mtx_leave(&rso->so_rcv.sb_mtx); > sounlock(rso); > if (fip->fi_readers > 0) > wakeup(&fip->fi_readers); > @@ -518,7 +520,6 @@ filt_fiforead(struct knote *kn, long hin > struct socket *so = kn->kn_hook; > int rv; > > - soassertlocked(so); > MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx); > > kn->kn_data = so->so_rcv.sb_cc; > @@ -574,7 +575,6 @@ filt_fifoexcept(struct knote *kn, long h > struct socket *so = kn->kn_hook; > int rv = 0; > > - soassertlocked(so); > MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx); > > if (kn->kn_flags & __EV_POLL) { > Index: sys/sys/socketvar.h > =================================================================== > RCS file: /cvs/src/sys/sys/socketvar.h,v > retrieving revision 1.125 > diff -u -p -r1.125 socketvar.h > --- sys/sys/socketvar.h 22 Mar 2024 17:34:11 -0000 1.125 > +++ sys/sys/socketvar.h 23 Mar 2024 18:59:11 -0000 > @@ -242,7 +242,10 @@ sb_notify(struct socket *so, struct sock > static inline long > sbspace(struct socket *so, struct sockbuf *sb) > { > - soassertlocked_readonly(so); > + if (sb->sb_flags & SB_MTXLOCK) > + sbmtxassertlocked(so, sb); > + else > + soassertlocked_readonly(so); > > return lmin(sb->sb_hiwat - sb->sb_cc, sb->sb_mbmax - sb->sb_mbcnt); > }