From: Vitaliy Makkoveev Subject: Re: Use `sb_mtx' mutex(9) for the `sb_flags' protection To: Alexander Bluhm Cc: tech@openbsd.org Date: Tue, 13 Feb 2024 17:11:12 +0300 On Tue, Feb 13, 2024 at 02:05:22PM +0100, Alexander Bluhm wrote: > On Tue, Feb 13, 2024 at 03:37:40PM +0300, Vitaliy Makkoveev wrote: > > I want to do this before go forward. > > > > Temporary, sockbuf structure members modification relies on `sb_mtx' > > mutex(9) only while shared solock() held, meanwhile the rest relies on > > exclusive solock(). Make the `sb_flags' protection consistent as it > > already done for `sb_timeo_nsecs' and `sb_klist'. > > > > The SB_SPLICE flag modification rests the special case. The `sb_mtx' > > mutex(9) only keeps `sb_flags' consistent while SB_SPLICE flag modifying. > > The socket splice paths serialization still rests with netlock. > > Is is a good idea to rip apart the atomicity of so_rcv.sb_flags and > so_snd.sb_flags? Either the socket is async or it is not. Now you > have sockets that are async on the reviece side, but not on the > send side. > Is this the problem? The IO operations on `so_snd' and `so_rcv' are asynchronous include the wakeup and SIGIO delivery. Also, we have SB_ASYNC for each sockbuf, but no SO_ASYNC or SS_ASYNC for the whole socket. `so_sigio' is common for the whole socket, but the flags are not. Also, there is no problem to take both mutexes together before modify SS_ASYNC. > Also for socket splicing so->so_rcv.sb_flags and sosp->so_snd.sb_flags > should be consistent. For now, exclusive net lock keeps things > together. But when you introduce a mutex it should not only look > at individual fields, but also the context. > They still are. The SB_SPLICE modification on both `so' and `sosp' still requires exclusive netlock to be take first before SB_SPLICE flag modification and all other read-only SB_SPLICE check paths. > > The SB_SPLICE flag modification rests the special case. The `sb_mtx' > > mutex(9) only keeps `sb_flags' consistent while SB_SPLICE flag modifying. > > The socket splice paths serialization still rests with netlock. The paths sosplice(), somove(), sounsplice(), etc are netlock serialized, but the `sb_flags' modification outside requires `sb_mtx'. Outside and inside netlock protected section. > short sb_flags and short sb_state live in the same integer. I am > not sure if we may have different locking strategies for them. At > least the alpha CPU may modify one of them with a common 32 access. > This is the problem. So, I propose to use `sb_mtx' for `sb_state' modification too. Index: sys/kern/sys_socket.c =================================================================== RCS file: /cvs/src/sys/kern/sys_socket.c,v retrieving revision 1.61 diff -u -p -r1.61 sys_socket.c --- sys/kern/sys_socket.c 15 Apr 2023 13:18:28 -0000 1.61 +++ sys/kern/sys_socket.c 13 Feb 2024 14:06:33 -0000 @@ -90,7 +90,8 @@ soo_ioctl(struct file *fp, u_long cmd, c break; case FIOASYNC: - solock(so); + mtx_enter(&so->so_rcv.sb_mtx); + mtx_enter(&so->so_snd.sb_mtx); if (*(int *)data) { so->so_rcv.sb_flags |= SB_ASYNC; so->so_snd.sb_flags |= SB_ASYNC; @@ -98,7 +99,8 @@ soo_ioctl(struct file *fp, u_long cmd, c so->so_rcv.sb_flags &= ~SB_ASYNC; so->so_snd.sb_flags &= ~SB_ASYNC; } - sounlock(so); + mtx_leave(&so->so_snd.sb_mtx); + mtx_leave(&so->so_rcv.sb_mtx); break; case FIONREAD: Index: sys/kern/uipc_socket.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_socket.c,v retrieving revision 1.320 diff -u -p -r1.320 uipc_socket.c --- sys/kern/uipc_socket.c 12 Feb 2024 22:48:27 -0000 1.320 +++ sys/kern/uipc_socket.c 13 Feb 2024 14:06:33 -0000 @@ -143,8 +143,8 @@ soalloc(const struct protosw *prp, int w return (NULL); rw_init_flags(&so->so_lock, dp->dom_name, RWL_DUPOK); refcnt_init(&so->so_refcnt); - mtx_init(&so->so_rcv.sb_mtx, IPL_MPFLOOR); - mtx_init(&so->so_snd.sb_mtx, IPL_MPFLOOR); + mtx_init_flags(&so->so_rcv.sb_mtx, IPL_MPFLOOR, "sbrcv", MTX_DUPOK); + mtx_init_flags(&so->so_snd.sb_mtx, IPL_MPFLOOR, "sbsnd", MTX_DUPOK); klist_init_mutex(&so->so_rcv.sb_klist, &so->so_rcv.sb_mtx); klist_init_mutex(&so->so_snd.sb_klist, &so->so_snd.sb_mtx); sigio_init(&so->so_sigio); @@ -587,7 +587,9 @@ sosend(struct socket *so, struct mbuf *a restart: if ((error = sblock(so, &so->so_snd, SBLOCKWAIT(flags))) != 0) goto out; + mtx_enter(&so->so_snd.sb_mtx); so->so_snd.sb_state |= SS_ISSENDING; + mtx_leave(&so->so_snd.sb_mtx); do { if (so->so_snd.sb_state & SS_CANTSENDMORE) snderr(EPIPE); @@ -621,7 +623,9 @@ restart: snderr(EWOULDBLOCK); sbunlock(so, &so->so_snd); error = sbwait(so, &so->so_snd); + mtx_enter(&so->so_snd.sb_mtx); so->so_snd.sb_state &= ~SS_ISSENDING; + mtx_leave(&so->so_snd.sb_mtx); if (error) goto out; goto restart; @@ -663,7 +667,9 @@ restart: } while (resid); release: + mtx_enter(&so->so_snd.sb_mtx); so->so_snd.sb_state &= ~SS_ISSENDING; + mtx_leave(&so->so_snd.sb_mtx); sbunlock(so, &so->so_snd); out: sounlock_shared(so); @@ -838,7 +844,7 @@ restart: sounlock_shared(so); return (error); } - sb_mtx_lock(&so->so_rcv); + mtx_enter(&so->so_rcv.sb_mtx); m = so->so_rcv.sb_mb; #ifdef SOCKET_SPLICE @@ -901,7 +907,7 @@ restart: } SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1"); SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1"); - sb_mtx_unlock(&so->so_rcv); + mtx_leave(&so->so_rcv.sb_mtx); sbunlock(so, &so->so_rcv); error = sbwait(so, &so->so_rcv); if (error) { @@ -974,13 +980,13 @@ dontblock: sbsync(&so->so_rcv, nextrecord); if (controlp) { if (pr->pr_domain->dom_externalize) { - sb_mtx_unlock(&so->so_rcv); + mtx_leave(&so->so_rcv.sb_mtx); sounlock_shared(so); error = (*pr->pr_domain->dom_externalize) (cm, controllen, flags); solock_shared(so); - sb_mtx_lock(&so->so_rcv); + mtx_enter(&so->so_rcv.sb_mtx); } *controlp = cm; } else { @@ -988,8 +994,11 @@ dontblock: * Dispose of any SCM_RIGHTS message that went * through the read path rather than recv. */ - if (pr->pr_domain->dom_dispose) + if (pr->pr_domain->dom_dispose) { + mtx_leave(&so->so_rcv.sb_mtx); pr->pr_domain->dom_dispose(cm); + mtx_enter(&so->so_rcv.sb_mtx); + } m_free(cm); } } @@ -1054,11 +1063,11 @@ dontblock: SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove"); SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove"); resid = uio->uio_resid; - sb_mtx_unlock(&so->so_rcv); + mtx_leave(&so->so_rcv.sb_mtx); sounlock_shared(so); uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio); solock_shared(so); - sb_mtx_lock(&so->so_rcv); + mtx_enter(&so->so_rcv.sb_mtx); if (uio_error) uio->uio_resid = resid - len; } else @@ -1140,14 +1149,14 @@ dontblock: break; SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2"); SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2"); - sb_mtx_unlock(&so->so_rcv); + mtx_leave(&so->so_rcv.sb_mtx); error = sbwait(so, &so->so_rcv); if (error) { sbunlock(so, &so->so_rcv); sounlock_shared(so); return (0); } - sb_mtx_lock(&so->so_rcv); + mtx_enter(&so->so_rcv.sb_mtx); if ((m = so->so_rcv.sb_mb) != NULL) nextrecord = m->m_nextpkt; } @@ -1174,13 +1183,16 @@ dontblock: } SBLASTRECORDCHK(&so->so_rcv, "soreceive 4"); SBLASTMBUFCHK(&so->so_rcv, "soreceive 4"); - if (pr->pr_flags & PR_WANTRCVD) + if (pr->pr_flags & PR_WANTRCVD) { + mtx_leave(&so->so_rcv.sb_mtx); pru_rcvd(so); + mtx_enter(&so->so_rcv.sb_mtx); + } } if (orig_resid == uio->uio_resid && orig_resid && (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SS_CANTRCVMORE) == 0) { - sb_mtx_unlock(&so->so_rcv); + mtx_leave(&so->so_rcv.sb_mtx); sbunlock(so, &so->so_rcv); goto restart; } @@ -1191,7 +1203,7 @@ dontblock: if (flagsp) *flagsp |= flags; release: - sb_mtx_unlock(&so->so_rcv); + mtx_leave(&so->so_rcv.sb_mtx); sbunlock(so, &so->so_rcv); sounlock_shared(so); return (error); @@ -1373,8 +1385,13 @@ sosplice(struct socket *so, int fd, off_ * we sleep, the socket buffers are not marked as spliced yet. */ if (somove(so, M_WAIT)) { + mtx_enter(&so->so_rcv.sb_mtx); so->so_rcv.sb_flags |= SB_SPLICE; + mtx_leave(&so->so_rcv.sb_mtx); + + mtx_enter(&sosp->so_snd.sb_mtx); sosp->so_snd.sb_flags |= SB_SPLICE; + mtx_leave(&sosp->so_snd.sb_mtx); } release: @@ -1400,8 +1417,15 @@ sounsplice(struct socket *so, struct soc task_del(sosplice_taskq, &so->so_splicetask); timeout_del(&so->so_idleto); + + mtx_enter(&sosp->so_snd.sb_mtx); sosp->so_snd.sb_flags &= ~SB_SPLICE; + mtx_leave(&sosp->so_snd.sb_mtx); + + mtx_enter(&so->so_rcv.sb_mtx); so->so_rcv.sb_flags &= ~SB_SPLICE; + mtx_leave(&so->so_rcv.sb_mtx); + so->so_sp->ssp_socket = sosp->so_sp->ssp_soback = NULL; /* Do not wakeup a socket that is about to be freed. */ if ((freeing & SOSP_FREEING_READ) == 0 && soreadable(so)) Index: sys/kern/uipc_socket2.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_socket2.c,v retrieving revision 1.144 diff -u -p -r1.144 uipc_socket2.c --- sys/kern/uipc_socket2.c 12 Feb 2024 22:48:27 -0000 1.144 +++ sys/kern/uipc_socket2.c 13 Feb 2024 14:06:33 -0000 @@ -140,26 +140,40 @@ void soisdisconnecting(struct socket *so) { soassertlocked(so); + so->so_state &= ~SS_ISCONNECTING; so->so_state |= SS_ISDISCONNECTING; + wakeup(&so->so_timeo); + + mtx_enter(&so->so_rcv.sb_mtx); so->so_rcv.sb_state |= SS_CANTRCVMORE; + mtx_leave(&so->so_rcv.sb_mtx); + sorwakeup(so); + + mtx_enter(&so->so_snd.sb_mtx); so->so_snd.sb_state |= SS_CANTSENDMORE; - wakeup(&so->so_timeo); + mtx_leave(&so->so_snd.sb_mtx); sowwakeup(so); - sorwakeup(so); } void soisdisconnected(struct socket *so) { soassertlocked(so); + so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); so->so_state |= SS_ISDISCONNECTED; + wakeup(&so->so_timeo); + + mtx_enter(&so->so_rcv.sb_mtx); so->so_rcv.sb_state |= SS_CANTRCVMORE; + mtx_leave(&so->so_rcv.sb_mtx); + sorwakeup(so); + + mtx_enter(&so->so_snd.sb_mtx); so->so_snd.sb_state |= SS_CANTSENDMORE; - wakeup(&so->so_timeo); + mtx_leave(&so->so_snd.sb_mtx); sowwakeup(so); - sorwakeup(so); } /* @@ -521,12 +535,13 @@ int sbwait(struct socket *so, struct sockbuf *sb) { uint64_t timeo_nsecs; - int prio = (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH; + int prio; soassertlocked(so); mtx_enter(&sb->sb_mtx); timeo_nsecs = sb->sb_timeo_nsecs; + prio = (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH; sb->sb_flags |= SB_WAIT; mtx_leave(&sb->sb_mtx); Index: sys/miscfs/fifofs/fifo_vnops.c =================================================================== RCS file: /cvs/src/sys/miscfs/fifofs/fifo_vnops.c,v retrieving revision 1.103 diff -u -p -r1.103 fifo_vnops.c --- sys/miscfs/fifofs/fifo_vnops.c 3 Feb 2024 22:50:09 -0000 1.103 +++ sys/miscfs/fifofs/fifo_vnops.c 13 Feb 2024 14:06:33 -0000 @@ -175,8 +175,10 @@ fifo_open(void *v) } fip->fi_readers = fip->fi_writers = 0; solock(wso); + mtx_enter(&wso->so_snd.sb_mtx); wso->so_snd.sb_state |= SS_CANTSENDMORE; wso->so_snd.sb_lowat = PIPE_BUF; + mtx_leave(&wso->so_snd.sb_mtx); sounlock(wso); } else { rso = fip->fi_readsock; @@ -186,7 +188,9 @@ fifo_open(void *v) fip->fi_readers++; if (fip->fi_readers == 1) { solock(wso); + mtx_enter(&wso->so_snd.sb_mtx); wso->so_snd.sb_state &= ~SS_CANTSENDMORE; + mtx_leave(&wso->so_snd.sb_mtx); sounlock(wso); if (fip->fi_writers > 0) wakeup(&fip->fi_writers); @@ -201,7 +205,9 @@ fifo_open(void *v) if (fip->fi_writers == 1) { solock(rso); rso->so_state &= ~SS_ISDISCONNECTED; + mtx_enter(&rso->so_rcv.sb_mtx); rso->so_rcv.sb_state &= ~SS_CANTRCVMORE; + mtx_leave(&rso->so_rcv.sb_mtx); sounlock(rso); if (fip->fi_readers > 0) wakeup(&fip->fi_readers); Index: sys/netinet/tcp_input.c =================================================================== RCS file: /cvs/src/sys/netinet/tcp_input.c,v retrieving revision 1.400 diff -u -p -r1.400 tcp_input.c --- sys/netinet/tcp_input.c 11 Feb 2024 01:27:45 -0000 1.400 +++ sys/netinet/tcp_input.c 13 Feb 2024 14:06:33 -0000 @@ -1898,10 +1898,12 @@ step6: */ if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { tp->rcv_up = th->th_seq + th->th_urp; + mtx_enter(&so->so_rcv.sb_mtx); so->so_oobmark = so->so_rcv.sb_cc + (tp->rcv_up - tp->rcv_nxt) - 1; if (so->so_oobmark == 0) so->so_rcv.sb_state |= SS_RCVATMARK; + mtx_leave(&so->so_rcv.sb_mtx); sohasoutofband(so); tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); } Index: sys/nfs/nfs_socket.c =================================================================== RCS file: /cvs/src/sys/nfs/nfs_socket.c,v retrieving revision 1.145 diff -u -p -r1.145 nfs_socket.c --- sys/nfs/nfs_socket.c 5 Feb 2024 20:21:39 -0000 1.145 +++ sys/nfs/nfs_socket.c 13 Feb 2024 14:06:33 -0000 @@ -369,11 +369,17 @@ nfs_connect(struct nfsmount *nmp, struct } solock(so); error = soreserve(so, sndreserve, rcvreserve); + sounlock(so); if (error) - goto bad_locked; + goto bad; + + mtx_enter(&so->so_rcv.sb_mtx); so->so_rcv.sb_flags |= SB_NOINTR; + mtx_leave(&so->so_rcv.sb_mtx); + + mtx_enter(&so->so_snd.sb_mtx); so->so_snd.sb_flags |= SB_NOINTR; - sounlock(so); + mtx_leave(&so->so_snd.sb_mtx); m_freem(mopt); m_freem(nam); Index: sys/nfs/nfs_syscalls.c =================================================================== RCS file: /cvs/src/sys/nfs/nfs_syscalls.c,v retrieving revision 1.121 diff -u -p -r1.121 nfs_syscalls.c --- sys/nfs/nfs_syscalls.c 5 Feb 2024 20:21:39 -0000 1.121 +++ sys/nfs/nfs_syscalls.c 13 Feb 2024 14:06:33 -0000 @@ -275,16 +275,17 @@ nfssvc_addsock(struct file *fp, struct m sosetopt(so, IPPROTO_TCP, TCP_NODELAY, m); m_freem(m); } - solock(so); - so->so_rcv.sb_flags &= ~SB_NOINTR; + mtx_enter(&so->so_rcv.sb_mtx); + so->so_rcv.sb_flags &= ~SB_NOINTR; so->so_rcv.sb_timeo_nsecs = INFSLP; mtx_leave(&so->so_rcv.sb_mtx); - so->so_snd.sb_flags &= ~SB_NOINTR; + mtx_enter(&so->so_snd.sb_mtx); + so->so_snd.sb_flags &= ~SB_NOINTR; so->so_snd.sb_timeo_nsecs = INFSLP; mtx_leave(&so->so_snd.sb_mtx); - sounlock(so); + if (tslp) slp = tslp; else { Index: sys/sys/socketvar.h =================================================================== RCS file: /cvs/src/sys/sys/socketvar.h,v retrieving revision 1.124 diff -u -p -r1.124 socketvar.h --- sys/sys/socketvar.h 12 Feb 2024 22:48:27 -0000 1.124 +++ sys/sys/socketvar.h 13 Feb 2024 14:06:34 -0000 @@ -49,6 +49,11 @@ typedef __socklen_t socklen_t; /* length type for network syscalls */ #endif +/* + * Locks used to protect struct members: + * m sb_mtx + */ + TAILQ_HEAD(soqhead, socket); /* @@ -120,12 +125,12 @@ struct socket { struct mbuf *sb_mbtail; /* the last mbuf in the chain */ struct mbuf *sb_lastrecord;/* first mbuf of last record in socket buffer */ - short sb_flags; /* flags, see below */ + short sb_flags; /* [m] flags, see below */ /* End area that is zeroed on flush. */ #define sb_endzero sb_flags short sb_state; /* socket state on sockbuf */ - uint64_t sb_timeo_nsecs;/* timeout for read/write */ - struct klist sb_klist; /* process selecting read/write */ + uint64_t sb_timeo_nsecs;/* [m] timeout for read/write */ + struct klist sb_klist; /* [m] process selecting read/write */ } so_rcv, so_snd; #define SB_MAX (2*1024*1024) /* default for max chars in sockbuf */ #define SB_LOCK 0x01 /* lock on data queue */ @@ -221,8 +226,6 @@ static inline int sb_notify(struct socket *so, struct sockbuf *sb) { int rv; - - soassertlocked(so); mtx_enter(&sb->sb_mtx); rv = ((sb->sb_flags & (SB_WAIT|SB_ASYNC|SB_SPLICE)) != 0 ||