Index | Thread | Search

From:
Vitaliy Makkoveev <mvs@openbsd.org>
Subject:
tcp(4): soreceive() with shared netlock
To:
Alexander Bluhm <bluhm@openbsd.org>, tech@openbsd.org
Date:
Mon, 12 Feb 2024 02:39:57 +0300

Download raw body.

Thread
  • Vitaliy Makkoveev:

    tcp(4): soreceive() with shared netlock

We only need to protect `so_rcv' with `sb_mtx' mutex(9) within
soreceive() and corresponding sbappendstream() within PCB layer.  

This time the SS_CANTRCVMORE check could be left unlocked at PCB layer,
but I want do it locked from now.

I propose this diff mostly for tests. At least I don't like the
relocking around pru_rcvd(), but if the benefits will be significant,
we could keep it for a while.

Index: sys/kern/uipc_socket.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.319
diff -u -p -r1.319 uipc_socket.c
--- sys/kern/uipc_socket.c	11 Feb 2024 21:36:49 -0000	1.319
+++ sys/kern/uipc_socket.c	11 Feb 2024 23:38:28 -0000
@@ -153,12 +153,7 @@ soalloc(const struct domain *dp, int wai
 	switch (dp->dom_family) {
 	case AF_INET:
 	case AF_INET6:
-		switch (dp->dom_protosw->pr_type) {
-		case SOCK_DGRAM:
-		case SOCK_RAW:
-			so->so_rcv.sb_flags |= SB_MTXLOCK;
-			break;
-		}
+		so->so_rcv.sb_flags |= SB_MTXLOCK;
 		break;
 	}
 
@@ -831,10 +826,10 @@ bad:
 	if (mp)
 		*mp = NULL;
 
-	solock_shared(so);
+	solock_recv(so);
 restart:
 	if ((error = sblock(so, &so->so_rcv, SBLOCKWAIT(flags))) != 0) {
-		sounlock_shared(so);
+		sounlock_recv(so);
 		return (error);
 	}
 	sb_mtx_lock(&so->so_rcv);
@@ -974,11 +969,11 @@ dontblock:
 			if (controlp) {
 				if (pr->pr_domain->dom_externalize) {
 					sb_mtx_unlock(&so->so_rcv);
-					sounlock_shared(so);
+					sounlock_recv(so);
 					error =
 					    (*pr->pr_domain->dom_externalize)
 					    (cm, controllen, flags);
-					solock_shared(so);
+					solock_recv(so);
 					sb_mtx_lock(&so->so_rcv);
 				}
 				*controlp = cm;
@@ -1054,9 +1049,9 @@ dontblock:
 			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
 			resid = uio->uio_resid;
 			sb_mtx_unlock(&so->so_rcv);
-			sounlock_shared(so);
+			sounlock_recv(so);
 			uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio);
-			solock_shared(so);
+			solock_recv(so);
 			sb_mtx_lock(&so->so_rcv);
 			if (uio_error)
 				uio->uio_resid = resid - len;
@@ -1173,8 +1168,19 @@ dontblock:
 		}
 		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
 		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
-		if (pr->pr_flags & PR_WANTRCVD)
+		if (pr->pr_flags & PR_WANTRCVD) {
+			sb_mtx_unlock(&so->so_rcv);
+			if (so->so_proto->pr_protocol != AF_UNIX) {
+				sounlock_recv(so);
+				solock(so);
+			}
 			pru_rcvd(so);
+			if (so->so_proto->pr_protocol != AF_UNIX) {
+				sounlock(so);
+				solock_recv(so);
+			}
+			sb_mtx_lock(&so->so_rcv);
+		}
 	}
 	if (orig_resid == uio->uio_resid && orig_resid &&
 	    (flags & MSG_EOR) == 0 &&
@@ -1192,7 +1198,7 @@ dontblock:
 release:
 	sb_mtx_unlock(&so->so_rcv);
 	sbunlock(so, &so->so_rcv);
-	sounlock_shared(so);
+	sounlock_recv(so);
 	return (error);
 }
 
Index: sys/kern/uipc_socket2.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket2.c,v
retrieving revision 1.143
diff -u -p -r1.143 uipc_socket2.c
--- sys/kern/uipc_socket2.c	11 Feb 2024 18:14:26 -0000	1.143
+++ sys/kern/uipc_socket2.c	11 Feb 2024 23:38:28 -0000
@@ -382,6 +382,21 @@ solock_shared(struct socket *so)
 	}
 }
 
+void
+solock_recv(struct socket *so)
+{
+	switch (so->so_proto->pr_domain->dom_family) {
+	case PF_INET:
+	case PF_INET6:
+		NET_LOCK_SHARED();
+		rw_enter_write(&so->so_lock);
+		break;
+	default:
+		rw_enter_write(&so->so_lock);
+		break;
+	}
+}
+
 int
 solock_persocket(struct socket *so)
 {
@@ -443,6 +458,21 @@ sounlock_shared(struct socket *so)
 }
 
 void
+sounlock_recv(struct socket *so)
+{
+	switch (so->so_proto->pr_domain->dom_family) {
+	case PF_INET:
+	case PF_INET6:
+		rw_exit_write(&so->so_lock);
+		NET_UNLOCK_SHARED();
+		break;
+	default:
+		rw_exit_write(&so->so_lock);
+		break;
+	}
+}
+
+void
 soassertlocked_readonly(struct socket *so)
 {
 	switch (so->so_proto->pr_domain->dom_family) {
@@ -486,15 +516,11 @@ sosleep_nsec(struct socket *so, void *id
 	switch (so->so_proto->pr_domain->dom_family) {
 	case PF_INET:
 	case PF_INET6:
-		if (so->so_proto->pr_usrreqs->pru_unlock != NULL &&
-		    rw_status(&netlock) == RW_READ) {
+		if (rw_status(&netlock) == RW_READ)
 			rw_exit_write(&so->so_lock);
-		}
 		ret = rwsleep_nsec(ident, &netlock, prio, wmesg, nsecs);
-		if (so->so_proto->pr_usrreqs->pru_lock != NULL &&
-		    rw_status(&netlock) == RW_READ) {
+		if (rw_status(&netlock) == RW_READ)
 			rw_enter_write(&so->so_lock);
-		}
 		break;
 	default:
 		ret = rwsleep_nsec(ident, &so->so_lock, prio, wmesg, nsecs);
@@ -852,8 +878,8 @@ sbappend(struct socket *so, struct sockb
 void
 sbappendstream(struct socket *so, struct sockbuf *sb, struct mbuf *m)
 {
-	KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
-	soassertlocked(so);
+	sbmtxassertlocked(so, sb);
+
 	KDASSERT(m->m_nextpkt == NULL);
 	KASSERT(sb->sb_mb == sb->sb_lastrecord);
 
Index: sys/netinet/tcp_input.c
===================================================================
RCS file: /cvs/src/sys/netinet/tcp_input.c,v
retrieving revision 1.400
diff -u -p -r1.400 tcp_input.c
--- sys/netinet/tcp_input.c	11 Feb 2024 01:27:45 -0000	1.400
+++ sys/netinet/tcp_input.c	11 Feb 2024 23:38:28 -0000
@@ -340,10 +340,12 @@ tcp_flush_queue(struct tcpcb *tp)
 		nq = TAILQ_NEXT(q, tcpqe_q);
 		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
 		ND6_HINT(tp);
+		mtx_enter(&so->so_rcv.sb_mtx);
 		if (so->so_rcv.sb_state & SS_CANTRCVMORE)
 			m_freem(q->tcpqe_m);
 		else
 			sbappendstream(so, &so->so_rcv, q->tcpqe_m);
+		mtx_leave(&so->so_rcv.sb_mtx);
 		pool_put(&tcpqe_pool, q);
 		q = nq;
 	} while (q != NULL && q->tcpqe_tcp->th_seq == tp->rcv_nxt);
@@ -1041,6 +1043,7 @@ findpcb:
 			 * Drop TCP, IP headers and TCP options then add data
 			 * to socket buffer.
 			 */
+			mtx_enter(&so->so_rcv.sb_mtx);
 			if (so->so_rcv.sb_state & SS_CANTRCVMORE)
 				m_freem(m);
 			else {
@@ -1056,6 +1059,7 @@ findpcb:
 				m_adj(m, iphlen + off);
 				sbappendstream(so, &so->so_rcv, m);
 			}
+			mtx_leave(&so->so_rcv.sb_mtx);
 			tp->t_flags |= TF_BLOCKOUTPUT;
 			sorwakeup(so);
 			tp->t_flags &= ~TF_BLOCKOUTPUT;
@@ -1944,12 +1948,14 @@ dodata:							/* XXX */
 			tiflags = th->th_flags & TH_FIN;
 			tcpstat_pkt(tcps_rcvpack, tcps_rcvbyte, tlen);
 			ND6_HINT(tp);
+			mtx_enter(&so->so_rcv.sb_mtx);
 			if (so->so_rcv.sb_state & SS_CANTRCVMORE)
 				m_freem(m);
 			else {
 				m_adj(m, hdroptlen);
 				sbappendstream(so, &so->so_rcv, m);
 			}
+			mtx_leave(&so->so_rcv.sb_mtx);
 			tp->t_flags |= TF_BLOCKOUTPUT;
 			sorwakeup(so);
 			tp->t_flags &= ~TF_BLOCKOUTPUT;
Index: sys/sys/socketvar.h
===================================================================
RCS file: /cvs/src/sys/sys/socketvar.h,v
retrieving revision 1.123
diff -u -p -r1.123 socketvar.h
--- sys/sys/socketvar.h	11 Feb 2024 18:14:27 -0000	1.123
+++ sys/sys/socketvar.h	11 Feb 2024 23:38:28 -0000
@@ -402,10 +402,12 @@ int	sockargs(struct mbuf **, const void 
 int	sosleep_nsec(struct socket *, void *, int, const char *, uint64_t);
 void	solock(struct socket *);
 void	solock_shared(struct socket *);
+void	solock_recv(struct socket *);
 int	solock_persocket(struct socket *);
 void	solock_pair(struct socket *, struct socket *);
 void	sounlock(struct socket *);
 void	sounlock_shared(struct socket *);
+void	sounlock_recv(struct socket *);
 
 int	sendit(struct proc *, int, struct msghdr *, int, register_t *);
 int	recvit(struct proc *, int, struct msghdr *, caddr_t, register_t *);