Index | Thread | Search

From:
Vitaliy Makkoveev <mvs@openbsd.org>
Subject:
Re: send socket buffer mutex in tcp_mss_update()
To:
Alexander Bluhm <alexander.bluhm@gmx.net>
Cc:
tech@openbsd.org
Date:
Tue, 24 Dec 2024 20:36:20 +0300

Download raw body.

Thread

> On 24 Dec 2024, at 20:31, Alexander Bluhm <alexander.bluhm@gmx.net> wrote:
> 
> On Tue, Dec 24, 2024 at 07:33:47PM +0300, Vitaliy Makkoveev wrote:
>> On Tue, Dec 24, 2024 at 02:43:37PM +0100, Alexander Bluhm wrote:
>>> Hi,
>>> 
>>> I found this fix in my larger TCP output unlocking diff.  Vitaliy,
>>> you address the same issue in your larger socket buffer mutex diff.
>>> I think it is better release the mutex before calling tcp_mss().
>>> There may be another route lookup, and I want to keep that without
>>> needless mutex.
>>> 
>>> ok?
>> 
>> You are right, lock is useless for the "bufsize < mss" branch. However,
>> isn't it better to update and commit my `so_snd' unlocking diff instead
>> of the only hunk?
> 
> I come from the other direction.  My goal is to set PR_MPSOCKET for
> TCP.  And the diff I sent is the final chunk that is missing to
> make parallel tcp_output() work.
> 
> As I see panics with the diff below, I would like to get the
> tcp_mss_update() chunk in independently.  This makes it easier to
> test my TCP PR_MPSOCKET diff and your TCP SB_MTXLOCK diff.  My diff
> is stable, yours not.  Maybe we should review and commit the diff
> below in smaller chunks to understand easier why my test machine
> panics.
> 
> bluhm
> 

Sure, commit this diff as is.

>> Index: sys/kern/uipc_socket.c
>> ===================================================================
>> RCS file: /cvs/src/sys/kern/uipc_socket.c,v
>> diff -u -p -r1.347 uipc_socket.c
>> --- sys/kern/uipc_socket.c	19 Dec 2024 22:11:35 -0000	1.347
>> +++ sys/kern/uipc_socket.c	24 Dec 2024 16:22:36 -0000
>> @@ -153,27 +153,8 @@ soalloc(const struct protosw *prp, int w
>> 	TAILQ_INIT(&so->so_q0);
>> 	TAILQ_INIT(&so->so_q);
>> 
>> -	switch (dp->dom_family) {
>> -	case AF_INET:
>> -	case AF_INET6:
>> -		switch (prp->pr_type) {
>> -		case SOCK_RAW:
>> -		case SOCK_DGRAM:
>> -			so->so_snd.sb_flags |= SB_MTXLOCK;
>> -			/* FALLTHROUGH */
>> -		case SOCK_STREAM:
>> -			so->so_rcv.sb_flags |= SB_MTXLOCK;
>> -			break;
>> -		}
>> -		break;
>> -	case AF_KEY:
>> -	case AF_ROUTE:
>> -	case AF_UNIX:
>> -	case AF_FRAME:
>> -		so->so_snd.sb_flags |= SB_MTXLOCK;
>> -		so->so_rcv.sb_flags |= SB_MTXLOCK;
>> -		break;
>> -	}
>> +	so->so_snd.sb_flags |= SB_MTXLOCK;
>> +	so->so_rcv.sb_flags |= SB_MTXLOCK;
>> 
>> 	return (so);
>> }
>> @@ -327,17 +308,13 @@ sofree(struct socket *so, int keep_lock)
>> 			sounlock(head);
>> 	}
>> 
>> -	switch (so->so_proto->pr_domain->dom_family) {
>> -	case AF_INET:
>> -	case AF_INET6:
>> -		if (so->so_proto->pr_type == SOCK_STREAM)
>> -			break;
>> -		/* FALLTHROUGH */
>> -	default:
>> +	if (!keep_lock) {
>> +		/*
>> +		 * sofree() was called from soclose(). Sleep is safe
>> +		 * even for tcp(4) sockets.
>> +		 */
>> 		sounlock(so);
>> 		refcnt_finalize(&so->so_refcnt, "sofinal");
>> -		solock(so);
>> -		break;
>> 	}
>> 
>> 	sigio_free(&so->so_sigio);
>> @@ -358,9 +335,6 @@ sofree(struct socket *so, int keep_lock)
>> 		(*so->so_proto->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
>> 	m_purge(so->so_rcv.sb_mb);
>> 
>> -	if (!keep_lock)
>> -		sounlock(so);
>> -
>> #ifdef SOCKET_SPLICE
>> 	if (so->so_sp) {
>> 		/* Reuse splice idle, sounsplice() has been called before. */
>> @@ -458,31 +432,6 @@ discard:
>> 	if (so->so_sp) {
>> 		struct socket *soback;
>> 
>> -		if (so->so_proto->pr_flags & PR_WANTRCVD) {
>> -			/*
>> -			 * Copy - Paste, but can't relock and sleep in
>> -			 * sofree() in tcp(4) case. That's why tcp(4)
>> -			 * still rely on solock() for splicing and
>> -			 * unsplicing.
>> -			 */
>> -
>> -			if (issplicedback(so)) {
>> -				int freeing = SOSP_FREEING_WRITE;
>> -
>> -				if (so->so_sp->ssp_soback == so)
>> -					freeing |= SOSP_FREEING_READ;
>> -				sounsplice(so->so_sp->ssp_soback, so, freeing);
>> -			}
>> -			if (isspliced(so)) {
>> -				int freeing = SOSP_FREEING_READ;
>> -
>> -				if (so == so->so_sp->ssp_socket)
>> -					freeing |= SOSP_FREEING_WRITE;
>> -				sounsplice(so, so->so_sp->ssp_socket, freeing);
>> -			}
>> -			goto free;
>> -		}
>> -
>> 		sounlock(so);
>> 		mtx_enter(&so->so_snd.sb_mtx);
>> 		/*
>> @@ -530,7 +479,6 @@ notsplicedback:
>> 
>> 		solock(so);
>> 	}
>> -free:
>> #endif /* SOCKET_SPLICE */
>> 	/* sofree() calls sounlock(). */
>> 	sofree(so, 0);
>> @@ -1587,17 +1535,22 @@ sotask(void *arg)
>> 	 */
>> 
>> 	sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
>> -	if (sockstream)
>> -		solock(so);
>> -
>> 	if (so->so_rcv.sb_flags & SB_SPLICE) {
>> -		if (sockstream)
>> +		struct socket *sosp = so->so_sp->ssp_socket;
>> +
>> +		if (sockstream) {
>> +			sblock(&sosp->so_snd, SBL_WAIT | SBL_NOINTR);
>> +			solock(so);
>> 			doyield = 1;
>> +		}
>> +
>> 		somove(so, M_DONTWAIT);
>> -	}
>> 
>> -	if (sockstream)
>> -		sounlock(so);
>> +		if (sockstream) {
>> +			sounlock(so);
>> +			sbunlock(&sosp->so_snd);
>> +		}
>> +	}
>> 	sbunlock(&so->so_rcv);
>> 
>> 	if (doyield) {
>> Index: sys/netinet/tcp_input.c
>> ===================================================================
>> RCS file: /cvs/src/sys/netinet/tcp_input.c,v
>> diff -u -p -r1.411 tcp_input.c
>> --- sys/netinet/tcp_input.c	24 Dec 2024 12:22:17 -0000	1.411
>> +++ sys/netinet/tcp_input.c	24 Dec 2024 16:22:36 -0000
>> @@ -957,7 +957,10 @@ findpcb:
>> 				    acked);
>> 				tp->t_rcvacktime = now;
>> 				ND6_HINT(tp);
>> +
>> +				mtx_enter(&so->so_snd.sb_mtx);
>> 				sbdrop(so, &so->so_snd, acked);
>> +				mtx_leave(&so->so_snd.sb_mtx);
>> 
>> 				/*
>> 				 * If we had a pending ICMP message that
>> @@ -1738,10 +1741,14 @@ trimthenstep6:
>> 				tp->snd_wnd -= so->so_snd.sb_cc;
>> 			else
>> 				tp->snd_wnd = 0;
>> +			mtx_enter(&so->so_snd.sb_mtx);
>> 			sbdrop(so, &so->so_snd, (int)so->so_snd.sb_cc);
>> +			mtx_leave(&so->so_snd.sb_mtx);
>> 			ourfinisacked = 1;
>> 		} else {
>> +			mtx_enter(&so->so_snd.sb_mtx);
>> 			sbdrop(so, &so->so_snd, acked);
>> +			mtx_leave(&so->so_snd.sb_mtx);
>> 			if (tp->snd_wnd > acked)
>> 				tp->snd_wnd -= acked;
>> 			else
>> @@ -2995,8 +3002,10 @@ tcp_mss_update(struct tcpcb *tp)
>> 	if (rt == NULL)
>> 		return;
>> 
>> +	mtx_enter(&so->so_snd.sb_mtx);
>> 	bufsize = so->so_snd.sb_hiwat;
>> 	if (bufsize < mss) {
>> +		mtx_leave(&so->so_snd.sb_mtx);
>> 		mss = bufsize;
>> 		/* Update t_maxseg and t_maxopd */
>> 		tcp_mss(tp, mss);
>> @@ -3005,6 +3014,7 @@ tcp_mss_update(struct tcpcb *tp)
>> 		if (bufsize > sb_max)
>> 			bufsize = sb_max;
>> 		(void)sbreserve(so, &so->so_snd, bufsize);
>> +		mtx_leave(&so->so_snd.sb_mtx);
>> 	}
>> 
>> 	mtx_enter(&so->so_rcv.sb_mtx);
>> Index: sys/netinet/tcp_output.c
>> ===================================================================
>> RCS file: /cvs/src/sys/netinet/tcp_output.c,v
>> diff -u -p -r1.146 tcp_output.c
>> --- sys/netinet/tcp_output.c	19 Dec 2024 22:11:35 -0000	1.146
>> +++ sys/netinet/tcp_output.c	24 Dec 2024 16:22:36 -0000
>> @@ -202,7 +202,7 @@ tcp_output(struct tcpcb *tp)
>> 	u_int32_t optbuf[howmany(MAX_TCPOPTLEN, sizeof(u_int32_t))];
>> 	u_char *opt = (u_char *)optbuf;
>> 	unsigned int optlen, hdrlen, packetlen;
>> -	int idle, sendalot = 0;
>> +	int doing_sosend, idle, sendalot = 0;
>> 	int i, sack_rxmit = 0;
>> 	struct sackhole *p;
>> 	uint64_t now;
>> @@ -227,6 +227,10 @@ tcp_output(struct tcpcb *tp)
>> 
>> 	now = tcp_now();
>> 
>> +	mtx_enter(&so->so_snd.sb_mtx);
>> +	doing_sosend=soissending(so);
>> +	mtx_leave(&so->so_snd.sb_mtx);
>> +
>> 	/*
>> 	 * Determine length of data that should be transmitted,
>> 	 * and flags that will be used.
>> @@ -243,7 +247,7 @@ tcp_output(struct tcpcb *tp)
>> 		tp->snd_cwnd = 2 * tp->t_maxseg;
>> 
>> 	/* remember 'idle' for next invocation of tcp_output */
>> -	if (idle && soissending(so)) {
>> +	if (idle && doing_sosend) {
>> 		tp->t_flags |= TF_LASTIDLE;
>> 		idle = 0;
>> 	} else
>> @@ -392,7 +396,7 @@ again:
>> 		if (len >= txmaxseg)
>> 			goto send;
>> 		if ((idle || (tp->t_flags & TF_NODELAY)) &&
>> -		    len + off >= so->so_snd.sb_cc && !soissending(so) &&
>> +		    len + off >= so->so_snd.sb_cc && !doing_sosend &&
>> 		    (tp->t_flags & TF_NOPUSH) == 0)
>> 			goto send;
>> 		if (tp->t_force)
>> @@ -725,7 +729,7 @@ send:
>> 		 * give data to the user when a buffer fills or
>> 		 * a PUSH comes in.)
>> 		 */
>> -		if (off + len == so->so_snd.sb_cc && !soissending(so))
>> +		if (off + len == so->so_snd.sb_cc && !doing_sosend)
>> 			flags |= TH_PUSH;
>> 		tp->t_sndtime = now;
>> 	} else {
>> Index: sys/netinet/tcp_usrreq.c
>> ===================================================================
>> RCS file: /cvs/src/sys/netinet/tcp_usrreq.c,v
>> diff -u -p -r1.233 tcp_usrreq.c
>> --- sys/netinet/tcp_usrreq.c	19 Dec 2024 22:11:35 -0000	1.233
>> +++ sys/netinet/tcp_usrreq.c	24 Dec 2024 16:22:36 -0000
>> @@ -302,10 +302,12 @@ tcp_fill_info(struct tcpcb *tp, struct s
>> 	ti->tcpi_so_rcv_sb_lowat = so->so_rcv.sb_lowat;
>> 	ti->tcpi_so_rcv_sb_wat = so->so_rcv.sb_wat;
>> 	mtx_leave(&so->so_rcv.sb_mtx);
>> +	mtx_enter(&so->so_snd.sb_mtx);
>> 	ti->tcpi_so_snd_sb_cc = so->so_snd.sb_cc;
>> 	ti->tcpi_so_snd_sb_hiwat = so->so_snd.sb_hiwat;
>> 	ti->tcpi_so_snd_sb_lowat = so->so_snd.sb_lowat;
>> 	ti->tcpi_so_snd_sb_wat = so->so_snd.sb_wat;
>> +	mtx_leave(&so->so_snd.sb_mtx);
>> 
>> 	return 0;
>> }
>> @@ -842,7 +844,9 @@ tcp_send(struct socket *so, struct mbuf 
>> 	if (so->so_options & SO_DEBUG)
>> 		ostate = tp->t_state;
>> 
>> +	mtx_enter(&so->so_snd.sb_mtx);
>> 	sbappendstream(so, &so->so_snd, m);
>> +	mtx_leave(&so->so_snd.sb_mtx);
>> 	m = NULL;
>> 
>> 	error = tcp_output(tp);
>> @@ -895,7 +899,9 @@ tcp_sense(struct socket *so, struct stat
>> 	if ((error = tcp_sogetpcb(so, &inp, &tp)))
>> 		return (error);
>> 
>> +	mtx_enter(&so->so_snd.sb_mtx);
>> 	ub->st_blksize = so->so_snd.sb_hiwat;
>> +	mtx_leave(&so->so_snd.sb_mtx);
>> 
>> 	if (so->so_options & SO_DEBUG)
>> 		tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_SENSE, 0);
>> @@ -970,7 +976,9 @@ tcp_sendoob(struct socket *so, struct mb
>> 	 * of data past the urgent section.
>> 	 * Otherwise, snd_up should be one lower.
>> 	 */
>> +	mtx_enter(&so->so_snd.sb_mtx);
>> 	sbappendstream(so, &so->so_snd, m);
>> +	mtx_leave(&so->so_snd.sb_mtx);
>> 	m = NULL;
>> 	tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
>> 	tp->t_force = 1;
>> @@ -1519,7 +1527,11 @@ void
>> tcp_update_sndspace(struct tcpcb *tp)
>> {
>> 	struct socket *so = tp->t_inpcb->inp_socket;
>> -	u_long nmax = so->so_snd.sb_hiwat;
>> +	u_long nmax;
>> +
>> +	mtx_enter(&so->so_snd.sb_mtx);
>> +
>> +	nmax = so->so_snd.sb_hiwat;
>> 
>> 	if (sbchecklowmem()) {
>> 		/* low on memory try to get rid of some */
>> @@ -1535,7 +1547,7 @@ tcp_update_sndspace(struct tcpcb *tp)
>> 	}
>> 
>> 	/* a writable socket must be preserved because of poll(2) semantics */
>> -	if (sbspace(so, &so->so_snd) >= so->so_snd.sb_lowat) {
>> +	if (sbspace_locked(so, &so->so_snd) >= so->so_snd.sb_lowat) {
>> 		if (nmax < so->so_snd.sb_cc + so->so_snd.sb_lowat)
>> 			nmax = so->so_snd.sb_cc + so->so_snd.sb_lowat;
>> 		/* keep in sync with sbreserve() calculation */
>> @@ -1548,6 +1560,8 @@ tcp_update_sndspace(struct tcpcb *tp)
>> 
>> 	if (nmax != so->so_snd.sb_hiwat)
>> 		sbreserve(so, &so->so_snd, nmax);
>> +
>> +	mtx_leave(&so->so_snd.sb_mtx);
>> }
>> 
>> /*
>> @@ -1581,9 +1595,11 @@ tcp_update_rcvspace(struct tcpcb *tp)
>> 	}
>> 
>> 	/* a readable socket must be preserved because of poll(2) semantics */
>> +	mtx_enter(&so->so_snd.sb_mtx);
>> 	if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat &&
>> 	    nmax < so->so_snd.sb_lowat)
>> 		nmax = so->so_snd.sb_lowat;
>> +	mtx_leave(&so->so_snd.sb_mtx);
>> 
>> 	if (nmax != so->so_rcv.sb_hiwat) {
>> 		/* round to MSS boundary */