Download raw body.
Use `sb_mtx' to protect `so_rcv' receive buffer of unix(4) sockets
This makes re-locking unnecessary in the uipc_*send() paths, because
it's enough to lock one socket to prevent the peer from concurrent
disconnection. As a little bonus, one unix(4) socket can perform
simultaneous transmission and reception, with one exception:
uipc_rcvd(), which still requires the re-lock for connection-oriented
sockets.
A few words about filt_soread() and filt_soexcept(). The socket lock is
not held while they are called from uipc_*send() through sorwakeup().
However, the unlocked access to the `so_options', `so_state' and
`so_error' is fine.
In the uipc_*send() paths the socket lock is still held on the sending
socket, so the receiving peer can't be disconnected. It also can't be
or become a listening socket, so SO_ACCEPTCONN is not set and can't be
set concurrently, nor can the SS_ISDISCONNECTED bit. The SS_ISCONNECTED
bit is always set and can't be cleared concurrently. `so_error' is set
on the peer sockets only by unp_detach(), which also can't be called
concurrently on the sending socket.
This is also true for filt_fiforead() and filt_fifoexcept(). For other
callers like kevent(2) or doaccept() the socket lock is still held.
Index: sys/kern/sys_socket.c
===================================================================
RCS file: /cvs/src/sys/kern/sys_socket.c,v
retrieving revision 1.61
diff -u -p -r1.61 sys_socket.c
--- sys/kern/sys_socket.c 15 Apr 2023 13:18:28 -0000 1.61
+++ sys/kern/sys_socket.c 23 Mar 2024 18:59:11 -0000
@@ -144,9 +144,11 @@ soo_stat(struct file *fp, struct stat *u
memset(ub, 0, sizeof (*ub));
ub->st_mode = S_IFSOCK;
solock(so);
+ mtx_enter(&so->so_rcv.sb_mtx);
if ((so->so_rcv.sb_state & SS_CANTRCVMORE) == 0 ||
so->so_rcv.sb_cc != 0)
ub->st_mode |= S_IRUSR | S_IRGRP | S_IROTH;
+ mtx_leave(&so->so_rcv.sb_mtx);
if ((so->so_snd.sb_state & SS_CANTSENDMORE) == 0)
ub->st_mode |= S_IWUSR | S_IWGRP | S_IWOTH;
ub->st_uid = so->so_euid;
Index: sys/kern/uipc_socket.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.321
diff -u -p -r1.321 uipc_socket.c
--- sys/kern/uipc_socket.c 22 Mar 2024 17:34:11 -0000 1.321
+++ sys/kern/uipc_socket.c 23 Mar 2024 18:59:11 -0000
@@ -160,6 +160,9 @@ soalloc(const struct protosw *prp, int w
break;
}
break;
+ case AF_UNIX:
+ so->so_rcv.sb_flags |= SB_MTXLOCK;
+ break;
}
return (so);
@@ -987,8 +990,11 @@ dontblock:
* Dispose of any SCM_RIGHTS message that went
* through the read path rather than recv.
*/
- if (pr->pr_domain->dom_dispose)
+ if (pr->pr_domain->dom_dispose) {
+ sb_mtx_unlock(&so->so_rcv);
pr->pr_domain->dom_dispose(cm);
+ sb_mtx_lock(&so->so_rcv);
+ }
m_free(cm);
}
}
@@ -1173,8 +1179,11 @@ dontblock:
}
SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
- if (pr->pr_flags & PR_WANTRCVD)
+ if (pr->pr_flags & PR_WANTRCVD) {
+ sb_mtx_unlock(&so->so_rcv);
pru_rcvd(so);
+ sb_mtx_lock(&so->so_rcv);
+ }
}
if (orig_resid == uio->uio_resid && orig_resid &&
(flags & MSG_EOR) == 0 &&
@@ -1233,10 +1242,10 @@ sorflush(struct socket *so)
/* with SBL_WAIT and SLB_NOINTR sblock() must not fail */
KASSERT(error == 0);
socantrcvmore(so);
+ mtx_enter(&sb->sb_mtx);
m = sb->sb_mb;
memset(&sb->sb_startzero, 0,
(caddr_t)&sb->sb_endzero - (caddr_t)&sb->sb_startzero);
- mtx_enter(&sb->sb_mtx);
sb->sb_timeo_nsecs = INFSLP;
mtx_leave(&sb->sb_mtx);
sbunlock(so, sb);
@@ -1757,7 +1766,8 @@ somove(struct socket *so, int wait)
void
sorwakeup(struct socket *so)
{
- soassertlocked_readonly(so);
+ if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
+ soassertlocked_readonly(so);
#ifdef SOCKET_SPLICE
if (so->so_rcv.sb_flags & SB_SPLICE) {
@@ -1877,6 +1887,8 @@ sosetopt(struct socket *so, int level, i
cnt = 1;
solock(so);
+ mtx_enter(&sb->sb_mtx);
+
switch (optname) {
case SO_SNDBUF:
case SO_RCVBUF:
@@ -1898,7 +1910,10 @@ sosetopt(struct socket *so, int level, i
sb->sb_hiwat : cnt;
break;
}
+
+ mtx_leave(&sb->sb_mtx);
sounlock(so);
+
break;
}
@@ -2169,13 +2184,6 @@ sofilt_unlock(struct socket *so, struct
}
}
-static inline void
-sofilt_assert_locked(struct socket *so, struct sockbuf *sb)
-{
- MUTEX_ASSERT_LOCKED(&sb->sb_mtx);
- soassertlocked_readonly(so);
-}
-
int
soo_kqfilter(struct file *fp, struct knote *kn)
{
@@ -2218,9 +2226,14 @@ filt_soread(struct knote *kn, long hint)
struct socket *so = kn->kn_fp->f_data;
int rv = 0;
- sofilt_assert_locked(so, &so->so_rcv);
+ MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx);
+ if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
+ soassertlocked_readonly(so);
if (so->so_options & SO_ACCEPTCONN) {
+ if (so->so_rcv.sb_flags & SB_MTXLOCK)
+ soassertlocked_readonly(so);
+
kn->kn_data = so->so_qlen;
rv = (kn->kn_data != 0);
@@ -2275,7 +2288,8 @@ filt_sowrite(struct knote *kn, long hint
struct socket *so = kn->kn_fp->f_data;
int rv;
- sofilt_assert_locked(so, &so->so_snd);
+ MUTEX_ASSERT_LOCKED(&so->so_snd.sb_mtx);
+ soassertlocked_readonly(so);
kn->kn_data = sbspace(so, &so->so_snd);
if (so->so_snd.sb_state & SS_CANTSENDMORE) {
@@ -2306,7 +2320,9 @@ filt_soexcept(struct knote *kn, long hin
struct socket *so = kn->kn_fp->f_data;
int rv = 0;
- sofilt_assert_locked(so, &so->so_rcv);
+ MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx);
+ if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
+ soassertlocked_readonly(so);
#ifdef SOCKET_SPLICE
if (isspliced(so)) {
Index: sys/kern/uipc_socket2.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket2.c,v
retrieving revision 1.144
diff -u -p -r1.144 uipc_socket2.c
--- sys/kern/uipc_socket2.c 12 Feb 2024 22:48:27 -0000 1.144
+++ sys/kern/uipc_socket2.c 23 Mar 2024 18:59:11 -0000
@@ -142,7 +142,9 @@ soisdisconnecting(struct socket *so)
soassertlocked(so);
so->so_state &= ~SS_ISCONNECTING;
so->so_state |= SS_ISDISCONNECTING;
+ mtx_enter(&so->so_rcv.sb_mtx);
so->so_rcv.sb_state |= SS_CANTRCVMORE;
+ mtx_leave(&so->so_rcv.sb_mtx);
so->so_snd.sb_state |= SS_CANTSENDMORE;
wakeup(&so->so_timeo);
sowwakeup(so);
@@ -155,7 +157,9 @@ soisdisconnected(struct socket *so)
soassertlocked(so);
so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
so->so_state |= SS_ISDISCONNECTED;
+ mtx_enter(&so->so_rcv.sb_mtx);
so->so_rcv.sb_state |= SS_CANTRCVMORE;
+ mtx_leave(&so->so_rcv.sb_mtx);
so->so_snd.sb_state |= SS_CANTSENDMORE;
wakeup(&so->so_timeo);
sowwakeup(so);
@@ -219,9 +223,10 @@ sonewconn(struct socket *head, int conns
mtx_enter(&head->so_snd.sb_mtx);
so->so_snd.sb_timeo_nsecs = head->so_snd.sb_timeo_nsecs;
mtx_leave(&head->so_snd.sb_mtx);
+
+ mtx_enter(&head->so_rcv.sb_mtx);
so->so_rcv.sb_wat = head->so_rcv.sb_wat;
so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
- mtx_enter(&head->so_rcv.sb_mtx);
so->so_rcv.sb_timeo_nsecs = head->so_rcv.sb_timeo_nsecs;
mtx_leave(&head->so_rcv.sb_mtx);
@@ -651,16 +656,22 @@ soreserve(struct socket *so, u_long sndc
if (sbreserve(so, &so->so_snd, sndcc))
goto bad;
- if (sbreserve(so, &so->so_rcv, rcvcc))
- goto bad2;
so->so_snd.sb_wat = sndcc;
- so->so_rcv.sb_wat = rcvcc;
- if (so->so_rcv.sb_lowat == 0)
- so->so_rcv.sb_lowat = 1;
if (so->so_snd.sb_lowat == 0)
so->so_snd.sb_lowat = MCLBYTES;
if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
+
+ mtx_enter(&so->so_rcv.sb_mtx);
+ if (sbreserve(so, &so->so_rcv, rcvcc)) {
+ mtx_leave(&so->so_rcv.sb_mtx);
+ goto bad2;
+ }
+ so->so_rcv.sb_wat = rcvcc;
+ if (so->so_rcv.sb_lowat == 0)
+ so->so_rcv.sb_lowat = 1;
+ mtx_leave(&so->so_rcv.sb_mtx);
+
return (0);
bad2:
sbrelease(so, &so->so_snd);
@@ -676,8 +687,7 @@ bad:
int
sbreserve(struct socket *so, struct sockbuf *sb, u_long cc)
{
- KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
- soassertlocked(so);
+ sbmtxassertlocked(so, sb);
if (cc == 0 || cc > sb_max)
return (1);
@@ -818,7 +828,7 @@ sbappend(struct socket *so, struct sockb
if (m == NULL)
return;
- soassertlocked(so);
+ sbmtxassertlocked(so, sb);
SBLASTRECORDCHK(sb, "sbappend 1");
if ((n = sb->sb_lastrecord) != NULL) {
@@ -899,8 +909,7 @@ sbappendrecord(struct socket *so, struct
{
struct mbuf *m;
- KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
- soassertlocked(so);
+ sbmtxassertlocked(so, sb);
if (m0 == NULL)
return;
@@ -984,6 +993,8 @@ sbappendcontrol(struct socket *so, struc
struct mbuf *m, *mlast, *n;
int eor = 0, space = 0;
+ sbmtxassertlocked(so, sb);
+
if (control == NULL)
panic("sbappendcontrol");
for (m = control; ; m = m->m_next) {
@@ -1109,8 +1120,7 @@ sbdrop(struct socket *so, struct sockbuf
struct mbuf *m, *mn;
struct mbuf *next;
- KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
- soassertlocked(so);
+ sbmtxassertlocked(so, sb);
next = (m = sb->sb_mb) ? m->m_nextpkt : NULL;
while (len > 0) {
Index: sys/kern/uipc_usrreq.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v
retrieving revision 1.202
diff -u -p -r1.202 uipc_usrreq.c
--- sys/kern/uipc_usrreq.c 22 Mar 2024 17:34:11 -0000 1.202
+++ sys/kern/uipc_usrreq.c 23 Mar 2024 18:59:11 -0000
@@ -489,8 +489,10 @@ uipc_rcvd(struct socket *so)
* Adjust backpressure on sender
* and wakeup any waiting to write.
*/
+ mtx_enter(&so->so_rcv.sb_mtx);
so2->so_snd.sb_mbcnt = so->so_rcv.sb_mbcnt;
so2->so_snd.sb_cc = so->so_rcv.sb_cc;
+ mtx_leave(&so->so_rcv.sb_mtx);
sowwakeup(so2);
sounlock(so2);
}
@@ -499,8 +501,9 @@ int
uipc_send(struct socket *so, struct mbuf *m, struct mbuf *nam,
struct mbuf *control)
{
+ struct unpcb *unp = sotounpcb(so);
struct socket *so2;
- int error = 0;
+ int error = 0, dowakeup = 0;
if (control) {
sounlock(so);
@@ -514,21 +517,24 @@ uipc_send(struct socket *so, struct mbuf
error = EPIPE;
goto dispose;
}
- if ((so2 = unp_solock_peer(so)) == NULL) {
+ if (unp->unp_conn == NULL) {
error = ENOTCONN;
goto dispose;
}
+ so2 = unp->unp_conn->unp_socket;
+
/*
* Send to paired receive port, and then raise
* send buffer counts to maintain backpressure.
* Wake up readers.
*/
+ mtx_enter(&so2->so_rcv.sb_mtx);
if (control) {
if (sbappendcontrol(so2, &so2->so_rcv, m, control)) {
control = NULL;
} else {
- sounlock(so2);
+ mtx_leave(&so2->so_rcv.sb_mtx);
error = ENOBUFS;
goto dispose;
}
@@ -539,9 +545,12 @@ uipc_send(struct socket *so, struct mbuf
so->so_snd.sb_mbcnt = so2->so_rcv.sb_mbcnt;
so->so_snd.sb_cc = so2->so_rcv.sb_cc;
if (so2->so_rcv.sb_cc > 0)
+ dowakeup = 1;
+ mtx_leave(&so2->so_rcv.sb_mtx);
+
+ if (dowakeup)
sorwakeup(so2);
- sounlock(so2);
m = NULL;
dispose:
@@ -563,7 +572,7 @@ uipc_dgram_send(struct socket *so, struc
struct unpcb *unp = sotounpcb(so);
struct socket *so2;
const struct sockaddr *from;
- int error = 0;
+ int error = 0, dowakeup = 0;
if (control) {
sounlock(so);
@@ -583,7 +592,7 @@ uipc_dgram_send(struct socket *so, struc
goto dispose;
}
- if ((so2 = unp_solock_peer(so)) == NULL) {
+ if (unp->unp_conn == NULL) {
if (nam != NULL)
error = ECONNREFUSED;
else
@@ -591,20 +600,24 @@ uipc_dgram_send(struct socket *so, struc
goto dispose;
}
+ so2 = unp->unp_conn->unp_socket;
+
if (unp->unp_addr)
from = mtod(unp->unp_addr, struct sockaddr *);
else
from = &sun_noname;
+
+ mtx_enter(&so2->so_rcv.sb_mtx);
if (sbappendaddr(so2, &so2->so_rcv, from, m, control)) {
- sorwakeup(so2);
+ dowakeup = 1;
m = NULL;
control = NULL;
} else
error = ENOBUFS;
+ mtx_leave(&so2->so_rcv.sb_mtx);
- if (so2 != so)
- sounlock(so2);
-
+ if (dowakeup)
+ sorwakeup(so2);
if (nam)
unp_disconnect(unp);
@@ -1390,9 +1403,9 @@ unp_gc(void *arg __unused)
if ((unp->unp_gcflags & UNP_GCDEAD) == 0)
continue;
so = unp->unp_socket;
- solock(so);
+ mtx_enter(&so->so_rcv.sb_mtx);
unp_scan(so->so_rcv.sb_mb, unp_remove_gcrefs);
- sounlock(so);
+ mtx_leave(&so->so_rcv.sb_mtx);
}
/*
@@ -1414,9 +1427,9 @@ unp_gc(void *arg __unused)
unp->unp_gcflags &= ~UNP_GCDEAD;
so = unp->unp_socket;
- solock(so);
+ mtx_enter(&so->so_rcv.sb_mtx);
unp_scan(so->so_rcv.sb_mb, unp_restore_gcrefs);
- sounlock(so);
+ mtx_leave(&so->so_rcv.sb_mtx);
KASSERT(nunref > 0);
nunref--;
Index: sys/miscfs/fifofs/fifo_vnops.c
===================================================================
RCS file: /cvs/src/sys/miscfs/fifofs/fifo_vnops.c,v
retrieving revision 1.103
diff -u -p -r1.103 fifo_vnops.c
--- sys/miscfs/fifofs/fifo_vnops.c 3 Feb 2024 22:50:09 -0000 1.103
+++ sys/miscfs/fifofs/fifo_vnops.c 23 Mar 2024 18:59:11 -0000
@@ -201,7 +201,9 @@ fifo_open(void *v)
if (fip->fi_writers == 1) {
solock(rso);
rso->so_state &= ~SS_ISDISCONNECTED;
+ mtx_enter(&rso->so_rcv.sb_mtx);
rso->so_rcv.sb_state &= ~SS_CANTRCVMORE;
+ mtx_leave(&rso->so_rcv.sb_mtx);
sounlock(rso);
if (fip->fi_readers > 0)
wakeup(&fip->fi_readers);
@@ -518,7 +520,6 @@ filt_fiforead(struct knote *kn, long hin
struct socket *so = kn->kn_hook;
int rv;
- soassertlocked(so);
MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx);
kn->kn_data = so->so_rcv.sb_cc;
@@ -574,7 +575,6 @@ filt_fifoexcept(struct knote *kn, long h
struct socket *so = kn->kn_hook;
int rv = 0;
- soassertlocked(so);
MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx);
if (kn->kn_flags & __EV_POLL) {
Index: sys/sys/socketvar.h
===================================================================
RCS file: /cvs/src/sys/sys/socketvar.h,v
retrieving revision 1.125
diff -u -p -r1.125 socketvar.h
--- sys/sys/socketvar.h 22 Mar 2024 17:34:11 -0000 1.125
+++ sys/sys/socketvar.h 23 Mar 2024 18:59:11 -0000
@@ -242,7 +242,10 @@ sb_notify(struct socket *so, struct sock
static inline long
sbspace(struct socket *so, struct sockbuf *sb)
{
- soassertlocked_readonly(so);
+ if (sb->sb_flags & SB_MTXLOCK)
+ sbmtxassertlocked(so, sb);
+ else
+ soassertlocked_readonly(so);
return lmin(sb->sb_hiwat - sb->sb_cc, sb->sb_mbmax - sb->sb_mbcnt);
}
Use `sb_mtx' to protect `so_rcv' receive buffer of unix(4) sockets