Index | Thread | Search

From:
Vitaliy Makkoveev <mvs@openbsd.org>
Subject:
Re: tcp(4): use per-sockbuf mutex to protect `so_snd' socket buffer
To:
Alexander Bluhm <bluhm@openbsd.org>
Cc:
OpenBSD tech <tech@openbsd.org>
Date:
Fri, 27 Dec 2024 14:44:46 +0300

Download raw body.

Thread
On Thu, Dec 26, 2024 at 06:43:29PM +0300, Vitaliy Makkoveev wrote:
> > On 26 Dec 2024, at 15:47, Alexander Bluhm <bluhm@openbsd.org> wrote:
> > 
> > On Tue, Dec 24, 2024 at 10:21:55PM +0300, Vitaliy Makkoveev wrote:
> >> This is the updated diff. It uses barriers instead of `ssp_idleto'
> >> timeout and `ssp_task' task redefinition.
> > 
> > I have tested this diff with my regression, performance, and network
> > setup.  Works fine.
> > 
> > OK bluhm@
> > 
> > Could you wait one day before commiting this?  Then we have a
> > snapshot between my and your unlocking diff.  If something goes
> > wrong, it will be easier to bisect.
> 
> Thanks for testing! Will commit this weekend or Monday.
> 

The updated diff. It doesn't contain already committed tcp_mss_update()
and unsplicing hunks.

Note, in sofree() I intentionally left "so->so_sp" check within
"if (!keep-lock)" block.

Index: sys/kern/uipc_socket.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket.c,v
diff -u -p -r1.349 uipc_socket.c
--- sys/kern/uipc_socket.c	27 Dec 2024 10:18:04 -0000	1.349
+++ sys/kern/uipc_socket.c	27 Dec 2024 11:36:04 -0000
@@ -151,27 +151,8 @@ soalloc(const struct protosw *prp, int w
 	TAILQ_INIT(&so->so_q0);
 	TAILQ_INIT(&so->so_q);
 
-	switch (dp->dom_family) {
-	case AF_INET:
-	case AF_INET6:
-		switch (prp->pr_type) {
-		case SOCK_RAW:
-		case SOCK_DGRAM:
-			so->so_snd.sb_flags |= SB_MTXLOCK;
-			/* FALLTHROUGH */
-		case SOCK_STREAM:
-			so->so_rcv.sb_flags |= SB_MTXLOCK;
-			break;
-		}
-		break;
-	case AF_KEY:
-	case AF_ROUTE:
-	case AF_UNIX:
-	case AF_FRAME:
-		so->so_snd.sb_flags |= SB_MTXLOCK;
-		so->so_rcv.sb_flags |= SB_MTXLOCK;
-		break;
-	}
+	so->so_snd.sb_flags |= SB_MTXLOCK;
+	so->so_rcv.sb_flags |= SB_MTXLOCK;
 
 	return (so);
 }
@@ -332,7 +313,6 @@ sofree(struct socket *so, int keep_lock)
 		 */
 		sounlock(so);
 		refcnt_finalize(&so->so_refcnt, "sofinal");
-		solock(so);
 	}
 
 	sigio_free(&so->so_sigio);
@@ -354,8 +334,6 @@ sofree(struct socket *so, int keep_lock)
 	m_purge(so->so_rcv.sb_mb);
 
 	if (!keep_lock) {
-		sounlock(so);
-
 #ifdef SOCKET_SPLICE
 		if (so->so_sp) {
 			timeout_del_barrier(&so->so_sp->ssp_idleto);
Index: sys/netinet/tcp_input.c
===================================================================
RCS file: /cvs/src/sys/netinet/tcp_input.c,v
diff -u -p -r1.413 tcp_input.c
--- sys/netinet/tcp_input.c	26 Dec 2024 12:16:17 -0000	1.413
+++ sys/netinet/tcp_input.c	27 Dec 2024 11:36:04 -0000
@@ -957,7 +957,10 @@ findpcb:
 				    acked);
 				tp->t_rcvacktime = now;
 				ND6_HINT(tp);
+
+				mtx_enter(&so->so_snd.sb_mtx);
 				sbdrop(so, &so->so_snd, acked);
+				mtx_leave(&so->so_snd.sb_mtx);
 
 				/*
 				 * If we had a pending ICMP message that
@@ -1738,10 +1741,14 @@ trimthenstep6:
 				tp->snd_wnd -= so->so_snd.sb_cc;
 			else
 				tp->snd_wnd = 0;
+			mtx_enter(&so->so_snd.sb_mtx);
 			sbdrop(so, &so->so_snd, (int)so->so_snd.sb_cc);
+			mtx_leave(&so->so_snd.sb_mtx);
 			ourfinisacked = 1;
 		} else {
+			mtx_enter(&so->so_snd.sb_mtx);
 			sbdrop(so, &so->so_snd, acked);
+			mtx_leave(&so->so_snd.sb_mtx);
 			if (tp->snd_wnd > acked)
 				tp->snd_wnd -= acked;
 			else
Index: sys/netinet/tcp_output.c
===================================================================
RCS file: /cvs/src/sys/netinet/tcp_output.c,v
diff -u -p -r1.147 tcp_output.c
--- sys/netinet/tcp_output.c	26 Dec 2024 12:16:17 -0000	1.147
+++ sys/netinet/tcp_output.c	27 Dec 2024 11:36:04 -0000
@@ -200,7 +200,7 @@ tcp_output(struct tcpcb *tp)
 	u_int32_t optbuf[howmany(MAX_TCPOPTLEN, sizeof(u_int32_t))];
 	u_char *opt = (u_char *)optbuf;
 	unsigned int optlen, hdrlen, packetlen;
-	int idle, sendalot = 0;
+	int doing_sosend, idle, sendalot = 0;
 	int i, sack_rxmit = 0;
 	struct sackhole *p;
 	uint64_t now;
@@ -225,6 +225,10 @@ tcp_output(struct tcpcb *tp)
 
 	now = tcp_now();
 
+	mtx_enter(&so->so_snd.sb_mtx);
+	doing_sosend=soissending(so);
+	mtx_leave(&so->so_snd.sb_mtx);
+
 	/*
 	 * Determine length of data that should be transmitted,
 	 * and flags that will be used.
@@ -241,7 +245,7 @@ tcp_output(struct tcpcb *tp)
 		tp->snd_cwnd = 2 * tp->t_maxseg;
 
 	/* remember 'idle' for next invocation of tcp_output */
-	if (idle && soissending(so)) {
+	if (idle && doing_sosend) {
 		tp->t_flags |= TF_LASTIDLE;
 		idle = 0;
 	} else
@@ -390,7 +394,7 @@ again:
 		if (len >= txmaxseg)
 			goto send;
 		if ((idle || (tp->t_flags & TF_NODELAY)) &&
-		    len + off >= so->so_snd.sb_cc && !soissending(so) &&
+		    len + off >= so->so_snd.sb_cc && !doing_sosend &&
 		    (tp->t_flags & TF_NOPUSH) == 0)
 			goto send;
 		if (tp->t_force)
@@ -723,7 +727,7 @@ send:
 		 * give data to the user when a buffer fills or
 		 * a PUSH comes in.)
 		 */
-		if (off + len == so->so_snd.sb_cc && !soissending(so))
+		if (off + len == so->so_snd.sb_cc && !doing_sosend)
 			flags |= TH_PUSH;
 		tp->t_sndtime = now;
 	} else {
Index: sys/netinet/tcp_usrreq.c
===================================================================
RCS file: /cvs/src/sys/netinet/tcp_usrreq.c,v
diff -u -p -r1.233 tcp_usrreq.c
--- sys/netinet/tcp_usrreq.c	19 Dec 2024 22:11:35 -0000	1.233
+++ sys/netinet/tcp_usrreq.c	27 Dec 2024 11:36:04 -0000
@@ -302,10 +302,12 @@ tcp_fill_info(struct tcpcb *tp, struct s
 	ti->tcpi_so_rcv_sb_lowat = so->so_rcv.sb_lowat;
 	ti->tcpi_so_rcv_sb_wat = so->so_rcv.sb_wat;
 	mtx_leave(&so->so_rcv.sb_mtx);
+	mtx_enter(&so->so_snd.sb_mtx);
 	ti->tcpi_so_snd_sb_cc = so->so_snd.sb_cc;
 	ti->tcpi_so_snd_sb_hiwat = so->so_snd.sb_hiwat;
 	ti->tcpi_so_snd_sb_lowat = so->so_snd.sb_lowat;
 	ti->tcpi_so_snd_sb_wat = so->so_snd.sb_wat;
+	mtx_leave(&so->so_snd.sb_mtx);
 
 	return 0;
 }
@@ -842,7 +844,9 @@ tcp_send(struct socket *so, struct mbuf 
 	if (so->so_options & SO_DEBUG)
 		ostate = tp->t_state;
 
+	mtx_enter(&so->so_snd.sb_mtx);
 	sbappendstream(so, &so->so_snd, m);
+	mtx_leave(&so->so_snd.sb_mtx);
 	m = NULL;
 
 	error = tcp_output(tp);
@@ -895,7 +899,9 @@ tcp_sense(struct socket *so, struct stat
 	if ((error = tcp_sogetpcb(so, &inp, &tp)))
 		return (error);
 
+	mtx_enter(&so->so_snd.sb_mtx);
 	ub->st_blksize = so->so_snd.sb_hiwat;
+	mtx_leave(&so->so_snd.sb_mtx);
 
 	if (so->so_options & SO_DEBUG)
 		tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_SENSE, 0);
@@ -970,7 +976,9 @@ tcp_sendoob(struct socket *so, struct mb
 	 * of data past the urgent section.
 	 * Otherwise, snd_up should be one lower.
 	 */
+	mtx_enter(&so->so_snd.sb_mtx);
 	sbappendstream(so, &so->so_snd, m);
+	mtx_leave(&so->so_snd.sb_mtx);
 	m = NULL;
 	tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
 	tp->t_force = 1;
@@ -1519,7 +1527,11 @@ void
 tcp_update_sndspace(struct tcpcb *tp)
 {
 	struct socket *so = tp->t_inpcb->inp_socket;
-	u_long nmax = so->so_snd.sb_hiwat;
+	u_long nmax;
+
+	mtx_enter(&so->so_snd.sb_mtx);
+
+	nmax = so->so_snd.sb_hiwat;
 
 	if (sbchecklowmem()) {
 		/* low on memory try to get rid of some */
@@ -1535,7 +1547,7 @@ tcp_update_sndspace(struct tcpcb *tp)
 	}
 
 	/* a writable socket must be preserved because of poll(2) semantics */
-	if (sbspace(so, &so->so_snd) >= so->so_snd.sb_lowat) {
+	if (sbspace_locked(so, &so->so_snd) >= so->so_snd.sb_lowat) {
 		if (nmax < so->so_snd.sb_cc + so->so_snd.sb_lowat)
 			nmax = so->so_snd.sb_cc + so->so_snd.sb_lowat;
 		/* keep in sync with sbreserve() calculation */
@@ -1548,6 +1560,8 @@ tcp_update_sndspace(struct tcpcb *tp)
 
 	if (nmax != so->so_snd.sb_hiwat)
 		sbreserve(so, &so->so_snd, nmax);
+
+	mtx_leave(&so->so_snd.sb_mtx);
 }
 
 /*
@@ -1581,9 +1595,11 @@ tcp_update_rcvspace(struct tcpcb *tp)
 	}
 
 	/* a readable socket must be preserved because of poll(2) semantics */
+	mtx_enter(&so->so_snd.sb_mtx);
 	if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat &&
 	    nmax < so->so_snd.sb_lowat)
 		nmax = so->so_snd.sb_lowat;
+	mtx_leave(&so->so_snd.sb_mtx);
 
 	if (nmax != so->so_rcv.sb_hiwat) {
 		/* round to MSS boundary */