Download raw body.
Use `sb_mtx' mutex(9) for the `sb_flags' protection
On Tue, Feb 13, 2024 at 02:05:22PM +0100, Alexander Bluhm wrote:
> On Tue, Feb 13, 2024 at 03:37:40PM +0300, Vitaliy Makkoveev wrote:
> > I want to do this before go forward.
> >
> > Temporary, sockbuf structure members modification relies on `sb_mtx'
> > mutex(9) only while shared solock() held, meanwhile the rest relies on
> > exclusive solock(). Make the `sb_flags' protection consistent as it
> > already done for `sb_timeo_nsecs' and `sb_klist'.
> >
The SB_SPLICE flag modification remains the special case. The `sb_mtx'
> > mutex(9) only keeps `sb_flags' consistent while SB_SPLICE flag modifying.
> > The socket splice paths serialization still rests with netlock.
>
> Is it a good idea to rip apart the atomicity of so_rcv.sb_flags and
> so_snd.sb_flags? Either the socket is async or it is not. Now you
> have sockets that are async on the receive side, but not on the
> send side.
>
Is this the problem? The IO operations on `so_snd' and `so_rcv' are
asynchronous, including the wakeup and SIGIO delivery. Also, we have
SB_ASYNC for each sockbuf, but no SO_ASYNC or SS_ASYNC for the whole
socket. `so_sigio' is common for the whole socket, but the flags are
not.
Also, there is no problem with taking both mutexes together before
modifying SB_ASYNC.
> Also for socket splicing so->so_rcv.sb_flags and sosp->so_snd.sb_flags
> should be consistent. For now, exclusive net lock keeps things
> together. But when you introduce a mutex it should not only look
> at individual fields, but also the context.
>
They still are. The SB_SPLICE modification on both `so' and `sosp' still
requires exclusive netlock to be taken first, both for the SB_SPLICE flag
modification and for all other read-only SB_SPLICE check paths.
> > The SB_SPLICE flag modification remains the special case. The `sb_mtx'
> > mutex(9) only keeps `sb_flags' consistent while SB_SPLICE flag modifying.
> > The socket splice paths serialization still rests with netlock.
The paths sosplice(), somove(), sounsplice(), etc. are netlock
serialized, but the `sb_flags' modification requires `sb_mtx' both
outside and inside the netlock protected section.
> short sb_flags and short sb_state live in the same integer. I am
> not sure if we may have different locking strategies for them. At
> least the alpha CPU may modify one of them with a common 32 access.
>
This is the problem. So, I propose to use `sb_mtx' for `sb_state'
modification too.
Index: sys/kern/sys_socket.c
===================================================================
RCS file: /cvs/src/sys/kern/sys_socket.c,v
retrieving revision 1.61
diff -u -p -r1.61 sys_socket.c
--- sys/kern/sys_socket.c 15 Apr 2023 13:18:28 -0000 1.61
+++ sys/kern/sys_socket.c 13 Feb 2024 14:06:33 -0000
@@ -90,7 +90,8 @@ soo_ioctl(struct file *fp, u_long cmd, c
break;
case FIOASYNC:
- solock(so);
+ mtx_enter(&so->so_rcv.sb_mtx);
+ mtx_enter(&so->so_snd.sb_mtx);
if (*(int *)data) {
so->so_rcv.sb_flags |= SB_ASYNC;
so->so_snd.sb_flags |= SB_ASYNC;
@@ -98,7 +99,8 @@ soo_ioctl(struct file *fp, u_long cmd, c
so->so_rcv.sb_flags &= ~SB_ASYNC;
so->so_snd.sb_flags &= ~SB_ASYNC;
}
- sounlock(so);
+ mtx_leave(&so->so_snd.sb_mtx);
+ mtx_leave(&so->so_rcv.sb_mtx);
break;
case FIONREAD:
Index: sys/kern/uipc_socket.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.320
diff -u -p -r1.320 uipc_socket.c
--- sys/kern/uipc_socket.c 12 Feb 2024 22:48:27 -0000 1.320
+++ sys/kern/uipc_socket.c 13 Feb 2024 14:06:33 -0000
@@ -143,8 +143,8 @@ soalloc(const struct protosw *prp, int w
return (NULL);
rw_init_flags(&so->so_lock, dp->dom_name, RWL_DUPOK);
refcnt_init(&so->so_refcnt);
- mtx_init(&so->so_rcv.sb_mtx, IPL_MPFLOOR);
- mtx_init(&so->so_snd.sb_mtx, IPL_MPFLOOR);
+ mtx_init_flags(&so->so_rcv.sb_mtx, IPL_MPFLOOR, "sbrcv", MTX_DUPOK);
+ mtx_init_flags(&so->so_snd.sb_mtx, IPL_MPFLOOR, "sbsnd", MTX_DUPOK);
klist_init_mutex(&so->so_rcv.sb_klist, &so->so_rcv.sb_mtx);
klist_init_mutex(&so->so_snd.sb_klist, &so->so_snd.sb_mtx);
sigio_init(&so->so_sigio);
@@ -587,7 +587,9 @@ sosend(struct socket *so, struct mbuf *a
restart:
if ((error = sblock(so, &so->so_snd, SBLOCKWAIT(flags))) != 0)
goto out;
+ mtx_enter(&so->so_snd.sb_mtx);
so->so_snd.sb_state |= SS_ISSENDING;
+ mtx_leave(&so->so_snd.sb_mtx);
do {
if (so->so_snd.sb_state & SS_CANTSENDMORE)
snderr(EPIPE);
@@ -621,7 +623,9 @@ restart:
snderr(EWOULDBLOCK);
sbunlock(so, &so->so_snd);
error = sbwait(so, &so->so_snd);
+ mtx_enter(&so->so_snd.sb_mtx);
so->so_snd.sb_state &= ~SS_ISSENDING;
+ mtx_leave(&so->so_snd.sb_mtx);
if (error)
goto out;
goto restart;
@@ -663,7 +667,9 @@ restart:
} while (resid);
release:
+ mtx_enter(&so->so_snd.sb_mtx);
so->so_snd.sb_state &= ~SS_ISSENDING;
+ mtx_leave(&so->so_snd.sb_mtx);
sbunlock(so, &so->so_snd);
out:
sounlock_shared(so);
@@ -838,7 +844,7 @@ restart:
sounlock_shared(so);
return (error);
}
- sb_mtx_lock(&so->so_rcv);
+ mtx_enter(&so->so_rcv.sb_mtx);
m = so->so_rcv.sb_mb;
#ifdef SOCKET_SPLICE
@@ -901,7 +907,7 @@ restart:
}
SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
- sb_mtx_unlock(&so->so_rcv);
+ mtx_leave(&so->so_rcv.sb_mtx);
sbunlock(so, &so->so_rcv);
error = sbwait(so, &so->so_rcv);
if (error) {
@@ -974,13 +980,13 @@ dontblock:
sbsync(&so->so_rcv, nextrecord);
if (controlp) {
if (pr->pr_domain->dom_externalize) {
- sb_mtx_unlock(&so->so_rcv);
+ mtx_leave(&so->so_rcv.sb_mtx);
sounlock_shared(so);
error =
(*pr->pr_domain->dom_externalize)
(cm, controllen, flags);
solock_shared(so);
- sb_mtx_lock(&so->so_rcv);
+ mtx_enter(&so->so_rcv.sb_mtx);
}
*controlp = cm;
} else {
@@ -988,8 +994,11 @@ dontblock:
* Dispose of any SCM_RIGHTS message that went
* through the read path rather than recv.
*/
- if (pr->pr_domain->dom_dispose)
+ if (pr->pr_domain->dom_dispose) {
+ mtx_leave(&so->so_rcv.sb_mtx);
pr->pr_domain->dom_dispose(cm);
+ mtx_enter(&so->so_rcv.sb_mtx);
+ }
m_free(cm);
}
}
@@ -1054,11 +1063,11 @@ dontblock:
SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
resid = uio->uio_resid;
- sb_mtx_unlock(&so->so_rcv);
+ mtx_leave(&so->so_rcv.sb_mtx);
sounlock_shared(so);
uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio);
solock_shared(so);
- sb_mtx_lock(&so->so_rcv);
+ mtx_enter(&so->so_rcv.sb_mtx);
if (uio_error)
uio->uio_resid = resid - len;
} else
@@ -1140,14 +1149,14 @@ dontblock:
break;
SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
- sb_mtx_unlock(&so->so_rcv);
+ mtx_leave(&so->so_rcv.sb_mtx);
error = sbwait(so, &so->so_rcv);
if (error) {
sbunlock(so, &so->so_rcv);
sounlock_shared(so);
return (0);
}
- sb_mtx_lock(&so->so_rcv);
+ mtx_enter(&so->so_rcv.sb_mtx);
if ((m = so->so_rcv.sb_mb) != NULL)
nextrecord = m->m_nextpkt;
}
@@ -1174,13 +1183,16 @@ dontblock:
}
SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
- if (pr->pr_flags & PR_WANTRCVD)
+ if (pr->pr_flags & PR_WANTRCVD) {
+ mtx_leave(&so->so_rcv.sb_mtx);
pru_rcvd(so);
+ mtx_enter(&so->so_rcv.sb_mtx);
+ }
}
if (orig_resid == uio->uio_resid && orig_resid &&
(flags & MSG_EOR) == 0 &&
(so->so_rcv.sb_state & SS_CANTRCVMORE) == 0) {
- sb_mtx_unlock(&so->so_rcv);
+ mtx_leave(&so->so_rcv.sb_mtx);
sbunlock(so, &so->so_rcv);
goto restart;
}
@@ -1191,7 +1203,7 @@ dontblock:
if (flagsp)
*flagsp |= flags;
release:
- sb_mtx_unlock(&so->so_rcv);
+ mtx_leave(&so->so_rcv.sb_mtx);
sbunlock(so, &so->so_rcv);
sounlock_shared(so);
return (error);
@@ -1373,8 +1385,13 @@ sosplice(struct socket *so, int fd, off_
* we sleep, the socket buffers are not marked as spliced yet.
*/
if (somove(so, M_WAIT)) {
+ mtx_enter(&so->so_rcv.sb_mtx);
so->so_rcv.sb_flags |= SB_SPLICE;
+ mtx_leave(&so->so_rcv.sb_mtx);
+
+ mtx_enter(&sosp->so_snd.sb_mtx);
sosp->so_snd.sb_flags |= SB_SPLICE;
+ mtx_leave(&sosp->so_snd.sb_mtx);
}
release:
@@ -1400,8 +1417,15 @@ sounsplice(struct socket *so, struct soc
task_del(sosplice_taskq, &so->so_splicetask);
timeout_del(&so->so_idleto);
+
+ mtx_enter(&sosp->so_snd.sb_mtx);
sosp->so_snd.sb_flags &= ~SB_SPLICE;
+ mtx_leave(&sosp->so_snd.sb_mtx);
+
+ mtx_enter(&so->so_rcv.sb_mtx);
so->so_rcv.sb_flags &= ~SB_SPLICE;
+ mtx_leave(&so->so_rcv.sb_mtx);
+
so->so_sp->ssp_socket = sosp->so_sp->ssp_soback = NULL;
/* Do not wakeup a socket that is about to be freed. */
if ((freeing & SOSP_FREEING_READ) == 0 && soreadable(so))
Index: sys/kern/uipc_socket2.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket2.c,v
retrieving revision 1.144
diff -u -p -r1.144 uipc_socket2.c
--- sys/kern/uipc_socket2.c 12 Feb 2024 22:48:27 -0000 1.144
+++ sys/kern/uipc_socket2.c 13 Feb 2024 14:06:33 -0000
@@ -140,26 +140,40 @@ void
soisdisconnecting(struct socket *so)
{
soassertlocked(so);
+
so->so_state &= ~SS_ISCONNECTING;
so->so_state |= SS_ISDISCONNECTING;
+ wakeup(&so->so_timeo);
+
+ mtx_enter(&so->so_rcv.sb_mtx);
so->so_rcv.sb_state |= SS_CANTRCVMORE;
+ mtx_leave(&so->so_rcv.sb_mtx);
+ sorwakeup(so);
+
+ mtx_enter(&so->so_snd.sb_mtx);
so->so_snd.sb_state |= SS_CANTSENDMORE;
- wakeup(&so->so_timeo);
+ mtx_leave(&so->so_snd.sb_mtx);
sowwakeup(so);
- sorwakeup(so);
}
void
soisdisconnected(struct socket *so)
{
soassertlocked(so);
+
so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
so->so_state |= SS_ISDISCONNECTED;
+ wakeup(&so->so_timeo);
+
+ mtx_enter(&so->so_rcv.sb_mtx);
so->so_rcv.sb_state |= SS_CANTRCVMORE;
+ mtx_leave(&so->so_rcv.sb_mtx);
+ sorwakeup(so);
+
+ mtx_enter(&so->so_snd.sb_mtx);
so->so_snd.sb_state |= SS_CANTSENDMORE;
- wakeup(&so->so_timeo);
+ mtx_leave(&so->so_snd.sb_mtx);
sowwakeup(so);
- sorwakeup(so);
}
/*
@@ -521,12 +535,13 @@ int
sbwait(struct socket *so, struct sockbuf *sb)
{
uint64_t timeo_nsecs;
- int prio = (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH;
+ int prio;
soassertlocked(so);
mtx_enter(&sb->sb_mtx);
timeo_nsecs = sb->sb_timeo_nsecs;
+ prio = (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH;
sb->sb_flags |= SB_WAIT;
mtx_leave(&sb->sb_mtx);
Index: sys/miscfs/fifofs/fifo_vnops.c
===================================================================
RCS file: /cvs/src/sys/miscfs/fifofs/fifo_vnops.c,v
retrieving revision 1.103
diff -u -p -r1.103 fifo_vnops.c
--- sys/miscfs/fifofs/fifo_vnops.c 3 Feb 2024 22:50:09 -0000 1.103
+++ sys/miscfs/fifofs/fifo_vnops.c 13 Feb 2024 14:06:33 -0000
@@ -175,8 +175,10 @@ fifo_open(void *v)
}
fip->fi_readers = fip->fi_writers = 0;
solock(wso);
+ mtx_enter(&wso->so_snd.sb_mtx);
wso->so_snd.sb_state |= SS_CANTSENDMORE;
wso->so_snd.sb_lowat = PIPE_BUF;
+ mtx_leave(&wso->so_snd.sb_mtx);
sounlock(wso);
} else {
rso = fip->fi_readsock;
@@ -186,7 +188,9 @@ fifo_open(void *v)
fip->fi_readers++;
if (fip->fi_readers == 1) {
solock(wso);
+ mtx_enter(&wso->so_snd.sb_mtx);
wso->so_snd.sb_state &= ~SS_CANTSENDMORE;
+ mtx_leave(&wso->so_snd.sb_mtx);
sounlock(wso);
if (fip->fi_writers > 0)
wakeup(&fip->fi_writers);
@@ -201,7 +205,9 @@ fifo_open(void *v)
if (fip->fi_writers == 1) {
solock(rso);
rso->so_state &= ~SS_ISDISCONNECTED;
+ mtx_enter(&rso->so_rcv.sb_mtx);
rso->so_rcv.sb_state &= ~SS_CANTRCVMORE;
+ mtx_leave(&rso->so_rcv.sb_mtx);
sounlock(rso);
if (fip->fi_readers > 0)
wakeup(&fip->fi_readers);
Index: sys/netinet/tcp_input.c
===================================================================
RCS file: /cvs/src/sys/netinet/tcp_input.c,v
retrieving revision 1.400
diff -u -p -r1.400 tcp_input.c
--- sys/netinet/tcp_input.c 11 Feb 2024 01:27:45 -0000 1.400
+++ sys/netinet/tcp_input.c 13 Feb 2024 14:06:33 -0000
@@ -1898,10 +1898,12 @@ step6:
*/
if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
tp->rcv_up = th->th_seq + th->th_urp;
+ mtx_enter(&so->so_rcv.sb_mtx);
so->so_oobmark = so->so_rcv.sb_cc +
(tp->rcv_up - tp->rcv_nxt) - 1;
if (so->so_oobmark == 0)
so->so_rcv.sb_state |= SS_RCVATMARK;
+ mtx_leave(&so->so_rcv.sb_mtx);
sohasoutofband(so);
tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
}
Index: sys/nfs/nfs_socket.c
===================================================================
RCS file: /cvs/src/sys/nfs/nfs_socket.c,v
retrieving revision 1.145
diff -u -p -r1.145 nfs_socket.c
--- sys/nfs/nfs_socket.c 5 Feb 2024 20:21:39 -0000 1.145
+++ sys/nfs/nfs_socket.c 13 Feb 2024 14:06:33 -0000
@@ -369,11 +369,17 @@ nfs_connect(struct nfsmount *nmp, struct
}
solock(so);
error = soreserve(so, sndreserve, rcvreserve);
+ sounlock(so);
if (error)
- goto bad_locked;
+ goto bad;
+
+ mtx_enter(&so->so_rcv.sb_mtx);
so->so_rcv.sb_flags |= SB_NOINTR;
+ mtx_leave(&so->so_rcv.sb_mtx);
+
+ mtx_enter(&so->so_snd.sb_mtx);
so->so_snd.sb_flags |= SB_NOINTR;
- sounlock(so);
+ mtx_leave(&so->so_snd.sb_mtx);
m_freem(mopt);
m_freem(nam);
Index: sys/nfs/nfs_syscalls.c
===================================================================
RCS file: /cvs/src/sys/nfs/nfs_syscalls.c,v
retrieving revision 1.121
diff -u -p -r1.121 nfs_syscalls.c
--- sys/nfs/nfs_syscalls.c 5 Feb 2024 20:21:39 -0000 1.121
+++ sys/nfs/nfs_syscalls.c 13 Feb 2024 14:06:33 -0000
@@ -275,16 +275,17 @@ nfssvc_addsock(struct file *fp, struct m
sosetopt(so, IPPROTO_TCP, TCP_NODELAY, m);
m_freem(m);
}
- solock(so);
- so->so_rcv.sb_flags &= ~SB_NOINTR;
+
mtx_enter(&so->so_rcv.sb_mtx);
+ so->so_rcv.sb_flags &= ~SB_NOINTR;
so->so_rcv.sb_timeo_nsecs = INFSLP;
mtx_leave(&so->so_rcv.sb_mtx);
- so->so_snd.sb_flags &= ~SB_NOINTR;
+
mtx_enter(&so->so_snd.sb_mtx);
+ so->so_snd.sb_flags &= ~SB_NOINTR;
so->so_snd.sb_timeo_nsecs = INFSLP;
mtx_leave(&so->so_snd.sb_mtx);
- sounlock(so);
+
if (tslp)
slp = tslp;
else {
Index: sys/sys/socketvar.h
===================================================================
RCS file: /cvs/src/sys/sys/socketvar.h,v
retrieving revision 1.124
diff -u -p -r1.124 socketvar.h
--- sys/sys/socketvar.h 12 Feb 2024 22:48:27 -0000 1.124
+++ sys/sys/socketvar.h 13 Feb 2024 14:06:34 -0000
@@ -49,6 +49,11 @@
typedef __socklen_t socklen_t; /* length type for network syscalls */
#endif
+/*
+ * Locks used to protect struct members:
+ * m sb_mtx
+ */
+
TAILQ_HEAD(soqhead, socket);
/*
@@ -120,12 +125,12 @@ struct socket {
struct mbuf *sb_mbtail; /* the last mbuf in the chain */
struct mbuf *sb_lastrecord;/* first mbuf of last record in
socket buffer */
- short sb_flags; /* flags, see below */
+ short sb_flags; /* [m] flags, see below */
/* End area that is zeroed on flush. */
#define sb_endzero sb_flags
short sb_state; /* socket state on sockbuf */
- uint64_t sb_timeo_nsecs;/* timeout for read/write */
- struct klist sb_klist; /* process selecting read/write */
+ uint64_t sb_timeo_nsecs;/* [m] timeout for read/write */
+ struct klist sb_klist; /* [m] process selecting read/write */
} so_rcv, so_snd;
#define SB_MAX (2*1024*1024) /* default for max chars in sockbuf */
#define SB_LOCK 0x01 /* lock on data queue */
@@ -221,8 +226,6 @@ static inline int
sb_notify(struct socket *so, struct sockbuf *sb)
{
int rv;
-
- soassertlocked(so);
mtx_enter(&sb->sb_mtx);
rv = ((sb->sb_flags & (SB_WAIT|SB_ASYNC|SB_SPLICE)) != 0 ||
Use `sb_mtx' mutex(9) for the `sb_flags' protection