From: Vitaliy Makkoveev Subject: Use `sb_mtx' instead of `inp_mtx' in receive path for inet sockets To: Alexander Bluhm , tech@openbsd.org Date: Sun, 4 Feb 2024 20:40:13 +0300 In soreceve(), we only touch `so_rcv' socket buffer, which has it's own `sb_mtx' mutex(9) for protection. So, we can avoid solock() in this path - it's enough to hold `sb_mtx' in soreceive() and around corresponding sbappend*(). But not right now :) This time we use shared netlock for some inet sockets in the soreceive() path. To protect `so_rcv' buffer we use `inp_mtx' mutex(9) and the prul_lock() to acquire this mutex(9) at socket layer. But the `inp_mtx' mutex belongs to the PCB. We initialize socket before PCB, tcp(4) sockets could exist without PCB, so I want to use different locks for PCB and socket layers of socket. This diff mechanically replaces `inp_mtx' by `sb_mtx' in the receive path. Only for sockets which already use `inp_mtx'. All other sockets left as is. They will be converted later. Since the `sb_mtx' is optional, the new SB_MTXLOCK flag introduced. If this flag is set on `sb_flags', the `sb_mtx' mutex(9) should be taken. New sb_mtx_lock() and sb_mtx_unlock() was introduced to hide this check. They are temporary and will be replaced by mtx_enter() when all this area will be converted to `sb_mtx' mutex(9). Also, the new sbmtxassertlocked() function introduced to throw corresponding assertion for SB_MTXLOCK marked buffers. This time only sbappendaddr() calls it. This function is also temporary and will be replaced by MTX_ASSERT_LOCKED() later. This naming could be strange, bu now we sblock() for different purpose, so I want to keep it as is for a while. Index: sys/kern/uipc_socket.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_socket.c,v retrieving revision 1.316 diff -u -p -r1.316 uipc_socket.c --- sys/kern/uipc_socket.c 3 Feb 2024 22:50:08 -0000 1.316 +++ sys/kern/uipc_socket.c 4 Feb 2024 17:08:47 -0000 @@ -150,6 +150,18 @@ soalloc(const struct domain *dp, int wai TAILQ_INIT(&so->so_q0); TAILQ_INIT(&so->so_q); + switch (dp->dom_family) { + case AF_INET: + case AF_INET6: + switch (dp->dom_protosw->pr_type) { + case SOCK_DGRAM: + case SOCK_RAW: + so->so_rcv.sb_flags |= SB_MTXLOCK; + break; + } + break; + } + return (so); } @@ -825,7 +837,7 @@ restart: sounlock_shared(so); return (error); } - pru_lock(so); + sb_mtx_lock(&so->so_rcv); m = so->so_rcv.sb_mb; #ifdef SOCKET_SPLICE @@ -889,7 +901,7 @@ restart: SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1"); SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1"); sbunlock(so, &so->so_rcv); - pru_unlock(so); + sb_mtx_unlock(&so->so_rcv); error = sbwait(so, &so->so_rcv); if (error) { sounlock_shared(so); @@ -961,13 +973,13 @@ dontblock: sbsync(&so->so_rcv, nextrecord); if (controlp) { if (pr->pr_domain->dom_externalize) { - pru_unlock(so); + sb_mtx_unlock(&so->so_rcv); sounlock_shared(so); error = (*pr->pr_domain->dom_externalize) (cm, controllen, flags); solock_shared(so); - pru_lock(so); + sb_mtx_lock(&so->so_rcv); } *controlp = cm; } else { @@ -1041,11 +1053,11 @@ dontblock: SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove"); SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove"); resid = uio->uio_resid; - pru_unlock(so); + sb_mtx_unlock(&so->so_rcv); sounlock_shared(so); uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio); solock_shared(so); - pru_lock(so); + sb_mtx_lock(&so->so_rcv); if (uio_error) uio->uio_resid = resid - len; } else @@ -1127,14 +1139,14 @@ dontblock: break; SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2"); SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2"); - pru_unlock(so); + sb_mtx_unlock(&so->so_rcv); error = sbwait(so, &so->so_rcv); if (error) { sbunlock(so, &so->so_rcv); sounlock_shared(so); return (0); } - pru_lock(so); + sb_mtx_lock(&so->so_rcv); if ((m = so->so_rcv.sb_mb) != NULL) nextrecord = m->m_nextpkt; } @@ -1168,7 +1180,7 @@ dontblock: (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SS_CANTRCVMORE) == 0) { sbunlock(so, &so->so_rcv); - pru_unlock(so); + sb_mtx_unlock(&so->so_rcv); goto restart; } @@ -1179,7 +1191,7 @@ dontblock: *flagsp |= flags; release: sbunlock(so, &so->so_rcv); - pru_unlock(so); + sb_mtx_unlock(&so->so_rcv); sounlock_shared(so); return (error); } Index: sys/kern/uipc_socket2.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_socket2.c,v retrieving revision 1.141 diff -u -p -r1.141 uipc_socket2.c --- sys/kern/uipc_socket2.c 3 Feb 2024 22:50:08 -0000 1.141 +++ sys/kern/uipc_socket2.c 4 Feb 2024 17:08:47 -0000 @@ -500,6 +500,16 @@ sosleep_nsec(struct socket *so, void *id return ret; } +void +sbmtxassertlocked(struct socket *so, struct sockbuf *sb) +{ + if (sb->sb_flags & SB_MTXLOCK) { + if (splassert_ctl > 0 && mtx_owned(&sb->sb_mtx) == 0) + splassert_fail(0, RW_WRITE, __func__); + } else + soassertlocked(so); +} + /* * Wait for data to arrive at/drain from a socket buffer. */ @@ -919,7 +929,7 @@ sbappendaddr(struct socket *so, struct s struct mbuf *m, *n, *nlast; int space = asa->sa_len; - soassertlocked(so); + sbmtxassertlocked(so, sb); if (m0 && (m0->m_flags & M_PKTHDR) == 0) panic("sbappendaddr"); Index: sys/netinet/ip_divert.c =================================================================== RCS file: /cvs/src/sys/netinet/ip_divert.c,v retrieving revision 1.93 diff -u -p -r1.93 ip_divert.c --- sys/netinet/ip_divert.c 3 Feb 2024 22:50:09 -0000 1.93 +++ sys/netinet/ip_divert.c 4 Feb 2024 17:08:47 -0000 @@ -241,14 +241,14 @@ divert_packet(struct mbuf *m, int dir, u in_proto_cksum_out(m, NULL); } - mtx_enter(&inp->inp_mtx); so = inp->inp_socket; + mtx_enter(&so->so_rcv.sb_mtx); if (sbappendaddr(so, &so->so_rcv, sintosa(&sin), m, NULL) == 0) { - mtx_leave(&inp->inp_mtx); + mtx_leave(&so->so_rcv.sb_mtx); divstat_inc(divs_fullsock); goto bad; } - mtx_leave(&inp->inp_mtx); + mtx_leave(&so->so_rcv.sb_mtx); sorwakeup(so); in_pcbunref(inp); Index: sys/netinet/ip_mroute.c =================================================================== RCS file: /cvs/src/sys/netinet/ip_mroute.c,v retrieving revision 1.140 diff -u -p -r1.140 ip_mroute.c --- sys/netinet/ip_mroute.c 6 Dec 2023 09:27:17 -0000 1.140 +++ sys/netinet/ip_mroute.c 4 Feb 2024 17:08:47 -0000 @@ -1051,12 +1051,11 @@ int socket_send(struct socket *so, struct mbuf *mm, struct sockaddr_in *src) { if (so != NULL) { - struct inpcb *inp = sotoinpcb(so); int ret; - mtx_enter(&inp->inp_mtx); + mtx_enter(&so->so_rcv.sb_mtx); ret = sbappendaddr(so, &so->so_rcv, sintosa(src), mm, NULL); - mtx_leave(&inp->inp_mtx); + mtx_leave(&so->so_rcv.sb_mtx); if (ret != 0) { sorwakeup(so); Index: sys/netinet/raw_ip.c =================================================================== RCS file: /cvs/src/sys/netinet/raw_ip.c,v retrieving revision 1.155 diff -u -p -r1.155 raw_ip.c --- sys/netinet/raw_ip.c 3 Feb 2024 22:50:09 -0000 1.155 +++ sys/netinet/raw_ip.c 4 Feb 2024 17:08:47 -0000 @@ -220,24 +220,24 @@ rip_input(struct mbuf **mp, int *offp, i else n = m_copym(m, 0, M_COPYALL, M_NOWAIT); if (n != NULL) { + struct socket *so = inp->inp_socket; int ret; if (inp->inp_flags & INP_CONTROLOPTS || inp->inp_socket->so_options & SO_TIMESTAMP) ip_savecontrol(inp, &opts, ip, n); - mtx_enter(&inp->inp_mtx); - ret = sbappendaddr(inp->inp_socket, - &inp->inp_socket->so_rcv, + mtx_enter(&so->so_rcv.sb_mtx); + ret = sbappendaddr(so, &so->so_rcv, sintosa(&ripsrc), n, opts); - mtx_leave(&inp->inp_mtx); + mtx_leave(&so->so_rcv.sb_mtx); if (ret == 0) { /* should notify about lost packet */ m_freem(n); m_freem(opts); } else - sorwakeup(inp->inp_socket); + sorwakeup(so); } in_pcbunref(inp); } Index: sys/netinet/udp_usrreq.c =================================================================== RCS file: /cvs/src/sys/netinet/udp_usrreq.c,v retrieving revision 1.317 diff -u -p -r1.317 udp_usrreq.c --- sys/netinet/udp_usrreq.c 3 Feb 2024 22:50:09 -0000 1.317 +++ sys/netinet/udp_usrreq.c 4 Feb 2024 17:08:47 -0000 @@ -697,15 +697,15 @@ udp_sbappend(struct inpcb *inp, struct m #endif m_adj(m, hlen); - mtx_enter(&inp->inp_mtx); + mtx_enter(&so->so_rcv.sb_mtx); if (sbappendaddr(so, &so->so_rcv, srcaddr, m, opts) == 0) { - mtx_leave(&inp->inp_mtx); + mtx_leave(&so->so_rcv.sb_mtx); udpstat_inc(udps_fullsock); m_freem(m); m_freem(opts); return; } - mtx_leave(&inp->inp_mtx); + mtx_leave(&so->so_rcv.sb_mtx); sorwakeup(so); } Index: sys/netinet6/ip6_divert.c =================================================================== RCS file: /cvs/src/sys/netinet6/ip6_divert.c,v retrieving revision 1.92 diff -u -p -r1.92 ip6_divert.c --- sys/netinet6/ip6_divert.c 3 Feb 2024 22:50:09 -0000 1.92 +++ sys/netinet6/ip6_divert.c 4 Feb 2024 17:08:47 -0000 @@ -248,14 +248,14 @@ divert6_packet(struct mbuf *m, int dir, in6_proto_cksum_out(m, NULL); } - mtx_enter(&inp->inp_mtx); so = inp->inp_socket; + mtx_enter(&so->so_rcv.sb_mtx); if (sbappendaddr(so, &so->so_rcv, sin6tosa(&sin6), m, NULL) == 0) { - mtx_leave(&inp->inp_mtx); + mtx_leave(&so->so_rcv.sb_mtx); div6stat_inc(div6s_fullsock); goto bad; } - mtx_leave(&inp->inp_mtx); + mtx_leave(&so->so_rcv.sb_mtx); sorwakeup(so); in_pcbunref(inp); Index: sys/netinet6/ip6_mroute.c =================================================================== RCS file: /cvs/src/sys/netinet6/ip6_mroute.c,v retrieving revision 1.139 diff -u -p -r1.139 ip6_mroute.c --- sys/netinet6/ip6_mroute.c 3 Feb 2024 22:50:09 -0000 1.139 +++ sys/netinet6/ip6_mroute.c 4 Feb 2024 17:08:47 -0000 @@ -856,17 +856,16 @@ int socket6_send(struct socket *so, struct mbuf *mm, struct sockaddr_in6 *src) { if (so != NULL) { - struct inpcb *inp = sotoinpcb(so); int ret; - mtx_enter(&inp->inp_mtx); + mtx_enter(&so->so_rcv.sb_mtx); ret = sbappendaddr(so, &so->so_rcv, sin6tosa(src), mm, NULL); - if (ret != 0) - sorwakeup(so); - mtx_leave(&inp->inp_mtx); + mtx_leave(&so->so_rcv.sb_mtx); - if (ret != 0) + if (ret != 0) { + sorwakeup(so); return 0; + } } m_freem(mm); return -1; Index: sys/netinet6/raw_ip6.c =================================================================== RCS file: /cvs/src/sys/netinet6/raw_ip6.c,v retrieving revision 1.180 diff -u -p -r1.180 raw_ip6.c --- sys/netinet6/raw_ip6.c 3 Feb 2024 22:50:09 -0000 1.180 +++ sys/netinet6/raw_ip6.c 4 Feb 2024 17:08:47 -0000 @@ -263,6 +263,7 @@ rip6_input(struct mbuf **mp, int *offp, else n = m_copym(m, 0, M_COPYALL, M_NOWAIT); if (n != NULL) { + struct socket *so = inp->inp_socket; int ret; if (inp->inp_flags & IN6P_CONTROLOPTS) @@ -270,11 +271,10 @@ rip6_input(struct mbuf **mp, int *offp, /* strip intermediate headers */ m_adj(n, *offp); - mtx_enter(&inp->inp_mtx); - ret = sbappendaddr(inp->inp_socket, - &inp->inp_socket->so_rcv, + mtx_enter(&so->so_rcv.sb_mtx); + ret = sbappendaddr(so, &so->so_rcv, sin6tosa(&rip6src), n, opts); - mtx_leave(&inp->inp_mtx); + mtx_leave(&so->so_rcv.sb_mtx); if (ret == 0) { /* should notify about lost packet */ @@ -282,7 +282,7 @@ rip6_input(struct mbuf **mp, int *offp, m_freem(opts); rip6stat_inc(rip6s_fullsock); } else - sorwakeup(inp->inp_socket); + sorwakeup(so); } in_pcbunref(inp); } Index: sys/sys/socketvar.h =================================================================== RCS file: /cvs/src/sys/sys/socketvar.h,v retrieving revision 1.122 diff -u -p -r1.122 socketvar.h --- sys/sys/socketvar.h 3 Feb 2024 22:50:09 -0000 1.122 +++ sys/sys/socketvar.h 4 Feb 2024 17:08:47 -0000 @@ -134,6 +134,7 @@ struct socket { #define SB_ASYNC 0x10 /* ASYNC I/O, need signals */ #define SB_SPLICE 0x20 /* buffer is splice source or drain */ #define SB_NOINTR 0x40 /* operations not interruptible */ +#define SB_MTXLOCK 0x80 /* use sb_mtx for sockbuf protection */ void (*so_upcall)(struct socket *so, caddr_t arg, int waitf); caddr_t so_upcallarg; /* Arg for above */ @@ -196,6 +197,22 @@ sorele(struct socket *so) #define isspliced(so) ((so)->so_sp && (so)->so_sp->ssp_socket) #define issplicedback(so) ((so)->so_sp && (so)->so_sp->ssp_soback) + +static inline void +sb_mtx_lock(struct sockbuf *sb) +{ + if (sb->sb_flags & SB_MTXLOCK) + mtx_enter(&sb->sb_mtx); +} + +static inline void +sb_mtx_unlock(struct sockbuf *sb) +{ + if (sb->sb_flags & SB_MTXLOCK) + mtx_leave(&sb->sb_mtx); +} + +void sbmtxassertlocked(struct socket *so, struct sockbuf *); /* * Do we need to notify the other side when I/O is possible?