From: Vitaliy Makkoveev Subject: tcp(4): soreceive() with shared netlock To: Alexander Bluhm , tech@openbsd.org Date: Mon, 12 Feb 2024 02:39:57 +0300 We only need to protect `so_rcv' with `sb_mtx' mutex(9) within soreceive() and corresponding sbappendstream() within PCB layer. This time the SS_CANTRCVMORE check could be left unlocked at PCB layer, but I want do it locked from now. I propose this diff mostly for tests. At least I don't like the relocking around pru_rcvd(), but if the benefits will be significant, we could keep it for a while. Index: sys/kern/uipc_socket.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_socket.c,v retrieving revision 1.319 diff -u -p -r1.319 uipc_socket.c --- sys/kern/uipc_socket.c 11 Feb 2024 21:36:49 -0000 1.319 +++ sys/kern/uipc_socket.c 11 Feb 2024 23:38:28 -0000 @@ -153,12 +153,7 @@ soalloc(const struct domain *dp, int wai switch (dp->dom_family) { case AF_INET: case AF_INET6: - switch (dp->dom_protosw->pr_type) { - case SOCK_DGRAM: - case SOCK_RAW: - so->so_rcv.sb_flags |= SB_MTXLOCK; - break; - } + so->so_rcv.sb_flags |= SB_MTXLOCK; break; } @@ -831,10 +826,10 @@ bad: if (mp) *mp = NULL; - solock_shared(so); + solock_recv(so); restart: if ((error = sblock(so, &so->so_rcv, SBLOCKWAIT(flags))) != 0) { - sounlock_shared(so); + sounlock_recv(so); return (error); } sb_mtx_lock(&so->so_rcv); @@ -974,11 +969,11 @@ dontblock: if (controlp) { if (pr->pr_domain->dom_externalize) { sb_mtx_unlock(&so->so_rcv); - sounlock_shared(so); + sounlock_recv(so); error = (*pr->pr_domain->dom_externalize) (cm, controllen, flags); - solock_shared(so); + solock_recv(so); sb_mtx_lock(&so->so_rcv); } *controlp = cm; @@ -1054,9 +1049,9 @@ dontblock: SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove"); resid = uio->uio_resid; sb_mtx_unlock(&so->so_rcv); - sounlock_shared(so); + sounlock_recv(so); uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio); - solock_shared(so); + solock_recv(so); sb_mtx_lock(&so->so_rcv); if (uio_error) uio->uio_resid = resid - len; @@ -1173,8 +1168,19 @@ dontblock: } SBLASTRECORDCHK(&so->so_rcv, "soreceive 4"); SBLASTMBUFCHK(&so->so_rcv, "soreceive 4"); - if (pr->pr_flags & PR_WANTRCVD) + if (pr->pr_flags & PR_WANTRCVD) { + sb_mtx_unlock(&so->so_rcv); + if (so->so_proto->pr_protocol != AF_UNIX) { + sounlock_recv(so); + solock(so); + } pru_rcvd(so); + if (so->so_proto->pr_protocol != AF_UNIX) { + sounlock(so); + solock_recv(so); + } + sb_mtx_lock(&so->so_rcv); + } } if (orig_resid == uio->uio_resid && orig_resid && (flags & MSG_EOR) == 0 && @@ -1192,7 +1198,7 @@ dontblock: release: sb_mtx_unlock(&so->so_rcv); sbunlock(so, &so->so_rcv); - sounlock_shared(so); + sounlock_recv(so); return (error); } Index: sys/kern/uipc_socket2.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_socket2.c,v retrieving revision 1.143 diff -u -p -r1.143 uipc_socket2.c --- sys/kern/uipc_socket2.c 11 Feb 2024 18:14:26 -0000 1.143 +++ sys/kern/uipc_socket2.c 11 Feb 2024 23:38:28 -0000 @@ -382,6 +382,21 @@ solock_shared(struct socket *so) } } +void +solock_recv(struct socket *so) +{ + switch (so->so_proto->pr_domain->dom_family) { + case PF_INET: + case PF_INET6: + NET_LOCK_SHARED(); + rw_enter_write(&so->so_lock); + break; + default: + rw_enter_write(&so->so_lock); + break; + } +} + int solock_persocket(struct socket *so) { @@ -443,6 +458,21 @@ sounlock_shared(struct socket *so) } void +sounlock_recv(struct socket *so) +{ + switch (so->so_proto->pr_domain->dom_family) { + case PF_INET: + case PF_INET6: + rw_exit_write(&so->so_lock); + NET_UNLOCK_SHARED(); + break; + default: + rw_exit_write(&so->so_lock); + break; + } +} + +void soassertlocked_readonly(struct socket *so) { switch (so->so_proto->pr_domain->dom_family) { @@ -486,15 +516,11 @@ sosleep_nsec(struct socket *so, void *id switch (so->so_proto->pr_domain->dom_family) { case PF_INET: case PF_INET6: - if (so->so_proto->pr_usrreqs->pru_unlock != NULL && - rw_status(&netlock) == RW_READ) { + if (rw_status(&netlock) == RW_READ) rw_exit_write(&so->so_lock); - } ret = rwsleep_nsec(ident, &netlock, prio, wmesg, nsecs); - if (so->so_proto->pr_usrreqs->pru_lock != NULL && - rw_status(&netlock) == RW_READ) { + if (rw_status(&netlock) == RW_READ) rw_enter_write(&so->so_lock); - } break; default: ret = rwsleep_nsec(ident, &so->so_lock, prio, wmesg, nsecs); @@ -852,8 +878,8 @@ sbappend(struct socket *so, struct sockb void sbappendstream(struct socket *so, struct sockbuf *sb, struct mbuf *m) { - KASSERT(sb == &so->so_rcv || sb == &so->so_snd); - soassertlocked(so); + sbmtxassertlocked(so, sb); + KDASSERT(m->m_nextpkt == NULL); KASSERT(sb->sb_mb == sb->sb_lastrecord); Index: sys/netinet/tcp_input.c =================================================================== RCS file: /cvs/src/sys/netinet/tcp_input.c,v retrieving revision 1.400 diff -u -p -r1.400 tcp_input.c --- sys/netinet/tcp_input.c 11 Feb 2024 01:27:45 -0000 1.400 +++ sys/netinet/tcp_input.c 11 Feb 2024 23:38:28 -0000 @@ -340,10 +340,12 @@ tcp_flush_queue(struct tcpcb *tp) nq = TAILQ_NEXT(q, tcpqe_q); TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q); ND6_HINT(tp); + mtx_enter(&so->so_rcv.sb_mtx); if (so->so_rcv.sb_state & SS_CANTRCVMORE) m_freem(q->tcpqe_m); else sbappendstream(so, &so->so_rcv, q->tcpqe_m); + mtx_leave(&so->so_rcv.sb_mtx); pool_put(&tcpqe_pool, q); q = nq; } while (q != NULL && q->tcpqe_tcp->th_seq == tp->rcv_nxt); @@ -1041,6 +1043,7 @@ findpcb: * Drop TCP, IP headers and TCP options then add data * to socket buffer. */ + mtx_enter(&so->so_rcv.sb_mtx); if (so->so_rcv.sb_state & SS_CANTRCVMORE) m_freem(m); else { @@ -1056,6 +1059,7 @@ findpcb: m_adj(m, iphlen + off); sbappendstream(so, &so->so_rcv, m); } + mtx_leave(&so->so_rcv.sb_mtx); tp->t_flags |= TF_BLOCKOUTPUT; sorwakeup(so); tp->t_flags &= ~TF_BLOCKOUTPUT; @@ -1944,12 +1948,14 @@ dodata: /* XXX */ tiflags = th->th_flags & TH_FIN; tcpstat_pkt(tcps_rcvpack, tcps_rcvbyte, tlen); ND6_HINT(tp); + mtx_enter(&so->so_rcv.sb_mtx); if (so->so_rcv.sb_state & SS_CANTRCVMORE) m_freem(m); else { m_adj(m, hdroptlen); sbappendstream(so, &so->so_rcv, m); } + mtx_leave(&so->so_rcv.sb_mtx); tp->t_flags |= TF_BLOCKOUTPUT; sorwakeup(so); tp->t_flags &= ~TF_BLOCKOUTPUT; Index: sys/sys/socketvar.h =================================================================== RCS file: /cvs/src/sys/sys/socketvar.h,v retrieving revision 1.123 diff -u -p -r1.123 socketvar.h --- sys/sys/socketvar.h 11 Feb 2024 18:14:27 -0000 1.123 +++ sys/sys/socketvar.h 11 Feb 2024 23:38:28 -0000 @@ -402,10 +402,12 @@ int sockargs(struct mbuf **, const void int sosleep_nsec(struct socket *, void *, int, const char *, uint64_t); void solock(struct socket *); void solock_shared(struct socket *); +void solock_recv(struct socket *); int solock_persocket(struct socket *); void solock_pair(struct socket *, struct socket *); void sounlock(struct socket *); void sounlock_shared(struct socket *); +void sounlock_recv(struct socket *); int sendit(struct proc *, int, struct msghdr *, int, register_t *); int recvit(struct proc *, int, struct msghdr *, caddr_t, register_t *);