Don't take solock() in sosend() and soreceive() paths for unix(4) sockets
On Fri, May 03, 2024 at 12:26:32AM +0200, Alexander Bluhm wrote:
> On Wed, May 01, 2024 at 01:01:22AM +0300, Vitaliy Makkoveev wrote:
> > On Tue, Apr 30, 2024 at 11:01:04PM +0300, Vitaliy Makkoveev wrote:
> > > Use `sb_mtx' mutex(9) and `sb_lock' rwlock(9) to protect `so_snd' and
> > > `so_rcv' of unix(4) sockets.
> > >
> > > The transmission path of unix(4) sockets is already half-unlocked,
> > > because the connected peer is not locked by solock() during the
> > > sbappend*() call. Since `so_snd' is protected by the `sb_mtx'
> > > mutex(9), re-locking is not required in uipc_rcvd() either.
> > >
> > > SB_OWNLOCK became redundant with SB_MTXLOCK, so remove it. SB_MTXLOCK
> > > was kept because the checks against SB_MTXLOCK within the sb*()
> > > routines look more consistent to me.
> > >
> > > Please note, the unlocked unix(4) peer `so2' can't be disconnected
> > > while solock() is held on `so'. That's why the unlocked sorwakeup()
> > > and sowwakeup() calls are fine; the corresponding paths will never
> > > be followed.
> > >
> >
> > Add the forgotten fifofs chunks. Unlike the previous version, this diff
> > does the `so_rcv' dispose and cleanup directly in sofree(). The socket
> > is almost dead at that point and unlinked from everywhere, including
> > the spliced peer, so a concurrent sotask() thread will just exit. This
> > is required to solve the inode and `so_rcv' lock ordering. It also
> > removes the re-locking from sofree() for all sockets.
>
> Passed regress with witness.
>
> One question, otherwise OK bluhm@
>
> [skip]
>
> You change the order of the EPIPE and ENOTCONN error checks here.
> The difference is relevant: EPIPE may raise a SIGPIPE.
>
> When writing into a socket that stops working, the error should be
> EPIPE. If there are two processes, A writing to B, and B
> disconnects the socket, then A should get an EPIPE.
>
> Do you change the behavior?
>
> In sosend() we have an SS_CANTSENDMORE check with an EPIPE error.
> So maybe the uipc_send() errors are irrelevant.
>
Yes, I change the behavior. I had concerns, so I checked the FreeBSD
version of uipc_send() and found that it does the "so->so_state &
SS_ISCONNECTED" check before the "so->so_snd.sb_state &
SBS_CANTSENDMORE" check. But sosend_generic() has the opposite order.
Since this code has existed for a long time, I consider the order
insignificant; a toy illustration of the difference follows.
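This is only a sketch, not code from either tree: the two check orders
return different errors for a socket that is both disconnected and shut
down, and only EPIPE is tied to SIGPIPE delivery.

#include <errno.h>
#include <stdio.h>

#define SS_CANTSENDMORE	0x1	/* stand-in for the kernel flag */

struct fakesock {
	int sb_state;		/* SS_CANTSENDMORE set? */
	int connected;		/* peer still attached? */
};

/* connection check first: ENOTCONN wins */
static int
check_conn_first(const struct fakesock *so)
{
	if (!so->connected)
		return (ENOTCONN);
	if (so->sb_state & SS_CANTSENDMORE)
		return (EPIPE);
	return (0);
}

/* shutdown check first: EPIPE wins */
static int
check_shutdown_first(const struct fakesock *so)
{
	if (so->sb_state & SS_CANTSENDMORE)
		return (EPIPE);
	if (!so->connected)
		return (ENOTCONN);
	return (0);
}

int
main(void)
{
	/* a socket that is both disconnected and shut down */
	struct fakesock so = { SS_CANTSENDMORE, 0 };

	printf("conn first: %d, shutdown first: %d\n",
	    check_conn_first(&so), check_shutdown_first(&so));
	return (0);
}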
However, our fifofs is the only place where we modify the
SS_CANTSENDMORE flag directly. We could keep solock() around those
modifications for serialization with uipc_send() and keep the existing
check order.
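For completeness, a self-contained userland demo (not part of the diff)
of the behavior bluhm@ describes: writing to a unix(4) stream socket
whose peer has disconnected yields EPIPE, and raises SIGPIPE unless it
is ignored.

#include <sys/socket.h>

#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	int sv[2];
	char c = 'x';

	/* report EPIPE instead of dying from SIGPIPE */
	signal(SIGPIPE, SIG_IGN);

	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1)
		return (1);
	close(sv[1]);			/* B disconnects */

	/* A writes: expect EPIPE */
	if (write(sv[0], &c, 1) == -1)
		printf("write: %s\n", strerror(errno));
	return (0);
}

The fifofs chunks below keep solock() around the SS_CANTSENDMORE
updates for exactly this serialization with uipc_send().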
Index: sys/kern/uipc_socket.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket.c,v
diff -u -p -r1.332 uipc_socket.c
--- sys/kern/uipc_socket.c 2 May 2024 11:55:31 -0000 1.332
+++ sys/kern/uipc_socket.c 2 May 2024 22:43:12 -0000
@@ -159,14 +159,15 @@ soalloc(const struct protosw *prp, int w
case AF_INET6:
switch (prp->pr_type) {
case SOCK_RAW:
- so->so_snd.sb_flags |= SB_MTXLOCK | SB_OWNLOCK;
+ so->so_snd.sb_flags |= SB_MTXLOCK;
/* FALLTHROUGH */
case SOCK_DGRAM:
- so->so_rcv.sb_flags |= SB_MTXLOCK | SB_OWNLOCK;
+ so->so_rcv.sb_flags |= SB_MTXLOCK;
break;
}
break;
case AF_UNIX:
+ so->so_snd.sb_flags |= SB_MTXLOCK;
so->so_rcv.sb_flags |= SB_MTXLOCK;
break;
}
@@ -354,18 +355,16 @@ sofree(struct socket *so, int keep_lock)
mtx_leave(&so->so_snd.sb_mtx);
/*
- * Regardless on '_locked' postfix, must release solock() before
- * call sorflush_locked() for SB_OWNLOCK marked socket. Can't
- * release solock() and call sorflush() because solock() release
- * is unwanted for tcp(4) socket.
+ * Unlocked dispose and cleanup is safe. Socket is unlinked
+ * from everywhere. Even concurrent sotask() thread will not
+ * call somove().
*/
+ if (so->so_proto->pr_flags & PR_RIGHTS &&
+ so->so_proto->pr_domain->dom_dispose)
+ (*so->so_proto->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
+ m_purge(so->so_rcv.sb_mb);
- if (so->so_rcv.sb_flags & SB_OWNLOCK)
- sounlock(so);
-
- sorflush_locked(so);
-
- if (!((so->so_rcv.sb_flags & SB_OWNLOCK) || keep_lock))
+ if (!keep_lock)
sounlock(so);
#ifdef SOCKET_SPLICE
@@ -574,7 +573,7 @@ sosend(struct socket *so, struct mbuf *a
size_t resid;
int error;
int atomic = sosendallatonce(so) || top;
- int dosolock = ((so->so_snd.sb_flags & SB_OWNLOCK) == 0);
+ int dosolock = ((so->so_snd.sb_flags & SB_MTXLOCK) == 0);
if (uio)
resid = uio->uio_resid;
@@ -846,7 +845,7 @@ soreceive(struct socket *so, struct mbuf
const struct protosw *pr = so->so_proto;
struct mbuf *nextrecord;
size_t resid, orig_resid = uio->uio_resid;
- int dosolock = ((so->so_rcv.sb_flags & SB_OWNLOCK) == 0);
+ int dosolock = ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0);
mp = mp0;
if (paddr)
@@ -945,7 +944,7 @@ restart:
SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
- if (so->so_rcv.sb_flags & (SB_MTXLOCK | SB_OWNLOCK)) {
+ if (so->so_rcv.sb_flags & SB_MTXLOCK) {
sbunlock_locked(so, &so->so_rcv);
if (dosolock)
sounlock_shared(so);
@@ -1247,7 +1246,11 @@ dontblock:
SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
if (pr->pr_flags & PR_WANTRCVD) {
sb_mtx_unlock(&so->so_rcv);
+ if (!dosolock)
+ solock_shared(so);
pru_rcvd(so);
+ if (!dosolock)
+ sounlock_shared(so);
sb_mtx_lock(&so->so_rcv);
}
}
@@ -1306,17 +1309,17 @@ sorflush_locked(struct socket *so)
const struct protosw *pr = so->so_proto;
int error;
- if ((sb->sb_flags & SB_OWNLOCK) == 0)
+ if ((sb->sb_flags & SB_MTXLOCK) == 0)
soassertlocked(so);
error = sblock(so, sb, SBL_WAIT | SBL_NOINTR);
/* with SBL_WAIT and SLB_NOINTR sblock() must not fail */
KASSERT(error == 0);
- if (sb->sb_flags & SB_OWNLOCK)
+ if (sb->sb_flags & SB_MTXLOCK)
solock(so);
socantrcvmore(so);
- if (sb->sb_flags & SB_OWNLOCK)
+ if (sb->sb_flags & SB_MTXLOCK)
sounlock(so);
mtx_enter(&sb->sb_mtx);
@@ -1334,10 +1337,10 @@ sorflush_locked(struct socket *so)
void
sorflush(struct socket *so)
{
- if ((so->so_rcv.sb_flags & SB_OWNLOCK) == 0)
+ if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
solock_shared(so);
sorflush_locked(so);
- if ((so->so_rcv.sb_flags & SB_OWNLOCK) == 0)
+ if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
sounlock_shared(so);
}
@@ -1383,7 +1386,7 @@ sosplice(struct socket *so, int fd, off_
membar_consumer();
}
- if (so->so_rcv.sb_flags & SB_OWNLOCK) {
+ if (so->so_rcv.sb_flags & SB_MTXLOCK) {
if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0)
return (error);
solock(so);
@@ -1471,7 +1474,7 @@ sosplice(struct socket *so, int fd, off_
release:
sbunlock(sosp, &sosp->so_snd);
out:
- if (so->so_rcv.sb_flags & SB_OWNLOCK) {
+ if (so->so_rcv.sb_flags & SB_MTXLOCK) {
sounlock(so);
sbunlock(so, &so->so_rcv);
} else {
@@ -1885,7 +1888,8 @@ sorwakeup(struct socket *so)
void
sowwakeup(struct socket *so)
{
- soassertlocked_readonly(so);
+ if ((so->so_snd.sb_flags & SB_MTXLOCK) == 0)
+ soassertlocked_readonly(so);
#ifdef SOCKET_SPLICE
if (so->so_snd.sb_flags & SB_SPLICE)
@@ -1976,7 +1980,7 @@ sosetopt(struct socket *so, int level, i
if ((long)cnt <= 0)
cnt = 1;
- if (((sb->sb_flags & SB_OWNLOCK) == 0))
+ if (((sb->sb_flags & SB_MTXLOCK) == 0))
solock(so);
mtx_enter(&sb->sb_mtx);
@@ -2003,7 +2007,7 @@ sosetopt(struct socket *so, int level, i
}
mtx_leave(&sb->sb_mtx);
- if (((sb->sb_flags & SB_OWNLOCK) == 0))
+ if (((sb->sb_flags & SB_MTXLOCK) == 0))
sounlock(so);
break;
@@ -2380,7 +2384,8 @@ filt_sowrite(struct knote *kn, long hint
int rv;
MUTEX_ASSERT_LOCKED(&so->so_snd.sb_mtx);
- soassertlocked_readonly(so);
+ if ((so->so_snd.sb_flags & SB_MTXLOCK) == 0)
+ soassertlocked_readonly(so);
kn->kn_data = sbspace(so, &so->so_snd);
if (so->so_snd.sb_state & SS_CANTSENDMORE) {
Index: sys/kern/uipc_socket2.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket2.c,v
diff -u -p -r1.152 uipc_socket2.c
--- sys/kern/uipc_socket2.c 2 May 2024 21:26:52 -0000 1.152
+++ sys/kern/uipc_socket2.c 2 May 2024 22:43:12 -0000
@@ -228,9 +228,10 @@ sonewconn(struct socket *head, int conns
*/
if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat))
goto fail;
+
+ mtx_enter(&head->so_snd.sb_mtx);
so->so_snd.sb_wat = head->so_snd.sb_wat;
so->so_snd.sb_lowat = head->so_snd.sb_lowat;
- mtx_enter(&head->so_snd.sb_mtx);
so->so_snd.sb_timeo_nsecs = head->so_snd.sb_timeo_nsecs;
mtx_leave(&head->so_snd.sb_mtx);
@@ -543,7 +544,7 @@ sblock(struct socket *so, struct sockbuf
{
int error = 0, prio = PSOCK;
- if (sb->sb_flags & SB_OWNLOCK) {
+ if (sb->sb_flags & SB_MTXLOCK) {
int rwflags = RW_WRITE;
if (!(flags & SBL_NOINTR || sb->sb_flags & SB_NOINTR))
@@ -586,7 +587,7 @@ out:
void
sbunlock_locked(struct socket *so, struct sockbuf *sb)
{
- if (sb->sb_flags & SB_OWNLOCK) {
+ if (sb->sb_flags & SB_MTXLOCK) {
rw_exit(&sb->sb_lock);
return;
}
@@ -603,7 +604,7 @@ sbunlock_locked(struct socket *so, struc
void
sbunlock(struct socket *so, struct sockbuf *sb)
{
- if (sb->sb_flags & SB_OWNLOCK) {
+ if (sb->sb_flags & SB_MTXLOCK) {
rw_exit(&sb->sb_lock);
return;
}
Index: sys/kern/uipc_usrreq.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v
diff -u -p -r1.205 uipc_usrreq.c
--- sys/kern/uipc_usrreq.c 2 May 2024 17:10:55 -0000 1.205
+++ sys/kern/uipc_usrreq.c 2 May 2024 22:43:12 -0000
@@ -477,20 +477,24 @@ uipc_dgram_shutdown(struct socket *so)
void
uipc_rcvd(struct socket *so)
{
+ struct unpcb *unp = sotounpcb(so);
struct socket *so2;
- if ((so2 = unp_solock_peer(so)) == NULL)
+ if (unp->unp_conn == NULL)
return;
+ so2 = unp->unp_conn->unp_socket;
+
/*
* Adjust backpressure on sender
* and wakeup any waiting to write.
*/
mtx_enter(&so->so_rcv.sb_mtx);
+ mtx_enter(&so2->so_snd.sb_mtx);
so2->so_snd.sb_mbcnt = so->so_rcv.sb_mbcnt;
so2->so_snd.sb_cc = so->so_rcv.sb_cc;
+ mtx_leave(&so2->so_snd.sb_mtx);
mtx_leave(&so->so_rcv.sb_mtx);
sowwakeup(so2);
- sounlock(so2);
}
int
@@ -525,11 +529,17 @@ uipc_send(struct socket *so, struct mbuf
* send buffer counts to maintain backpressure.
* Wake up readers.
*/
+ /*
+ * sbappend*() should be serialized together
+ * with so_snd modification.
+ */
mtx_enter(&so2->so_rcv.sb_mtx);
+ mtx_enter(&so->so_snd.sb_mtx);
if (control) {
if (sbappendcontrol(so2, &so2->so_rcv, m, control)) {
control = NULL;
} else {
+ mtx_leave(&so->so_snd.sb_mtx);
mtx_leave(&so2->so_rcv.sb_mtx);
error = ENOBUFS;
goto dispose;
@@ -542,6 +552,7 @@ uipc_send(struct socket *so, struct mbuf
so->so_snd.sb_cc = so2->so_rcv.sb_cc;
if (so2->so_rcv.sb_cc > 0)
dowakeup = 1;
+ mtx_leave(&so->so_snd.sb_mtx);
mtx_leave(&so2->so_rcv.sb_mtx);
if (dowakeup)
Index: sys/miscfs/fifofs/fifo_vnops.c
===================================================================
RCS file: /cvs/src/sys/miscfs/fifofs/fifo_vnops.c,v
diff -u -p -r1.104 fifo_vnops.c
--- sys/miscfs/fifofs/fifo_vnops.c 26 Mar 2024 09:46:47 -0000 1.104
+++ sys/miscfs/fifofs/fifo_vnops.c 2 May 2024 22:43:12 -0000
@@ -174,9 +174,12 @@ fifo_open(void *v)
return (error);
}
fip->fi_readers = fip->fi_writers = 0;
+ /* Take solock() to serialize with uipc_send() */
solock(wso);
+ mtx_enter(&wso->so_snd.sb_mtx);
wso->so_snd.sb_state |= SS_CANTSENDMORE;
wso->so_snd.sb_lowat = PIPE_BUF;
+ mtx_leave(&wso->so_snd.sb_mtx);
sounlock(wso);
} else {
rso = fip->fi_readsock;
@@ -185,8 +188,11 @@ fifo_open(void *v)
if (ap->a_mode & FREAD) {
fip->fi_readers++;
if (fip->fi_readers == 1) {
+ /* Take solock() to serialize with uipc_send() */
solock(wso);
+ mtx_enter(&wso->so_snd.sb_mtx);
wso->so_snd.sb_state &= ~SS_CANTSENDMORE;
+ mtx_leave(&wso->so_snd.sb_mtx);
sounlock(wso);
if (fip->fi_writers > 0)
wakeup(&fip->fi_writers);
@@ -554,7 +560,6 @@ filt_fifowrite(struct knote *kn, long hi
struct socket *so = kn->kn_hook;
int rv;
- soassertlocked(so);
MUTEX_ASSERT_LOCKED(&so->so_snd.sb_mtx);
kn->kn_data = sbspace(so, &so->so_snd);
@@ -625,11 +630,9 @@ filt_fifowmodify(struct kevent *kev, str
struct socket *so = kn->kn_hook;
int rv;
- solock(so);
mtx_enter(&so->so_snd.sb_mtx);
rv = knote_modify(kev, kn);
mtx_leave(&so->so_snd.sb_mtx);
- sounlock(so);
return (rv);
}
@@ -640,11 +643,9 @@ filt_fifowprocess(struct knote *kn, stru
struct socket *so = kn->kn_hook;
int rv;
- solock(so);
mtx_enter(&so->so_snd.sb_mtx);
rv = knote_process(kn, kev);
mtx_leave(&so->so_snd.sb_mtx);
- sounlock(so);
return (rv);
}
Index: sys/sys/socketvar.h
===================================================================
RCS file: /cvs/src/sys/sys/socketvar.h,v
diff -u -p -r1.129 socketvar.h
--- sys/sys/socketvar.h 11 Apr 2024 13:32:51 -0000 1.129
+++ sys/sys/socketvar.h 2 May 2024 22:43:12 -0000
@@ -134,8 +134,7 @@ struct socket {
#define SB_ASYNC 0x0010 /* ASYNC I/O, need signals */
#define SB_SPLICE 0x0020 /* buffer is splice source or drain */
#define SB_NOINTR 0x0040 /* operations not interruptible */
-#define SB_MTXLOCK 0x0080 /* use sb_mtx for sockbuf protection */
-#define SB_OWNLOCK 0x0100 /* sblock() doesn't need solock() */
+#define SB_MTXLOCK 0x0080 /* sblock() doesn't need solock() */
void (*so_upcall)(struct socket *so, caddr_t arg, int waitf);
caddr_t so_upcallarg; /* Arg for above */