Index | Thread | Search

From:
Alexander Bluhm <bluhm@openbsd.org>
Subject:
Re: send socket buffer mutex in tcp_mss_update()
To:
Vitaliy Makkoveev <mvs@openbsd.org>
Cc:
tech@openbsd.org
Date:
Tue, 24 Dec 2024 18:31:27 +0100

Download raw body.

Thread
On Tue, Dec 24, 2024 at 07:33:47PM +0300, Vitaliy Makkoveev wrote:
> On Tue, Dec 24, 2024 at 02:43:37PM +0100, Alexander Bluhm wrote:
> > Hi,
> > 
> > I found this fix in my larger TCP output unlocking diff.  Vitaliy,
> > you address the same issue in your larger socket buffer mutex diff.
> > I think it is better release the mutex before calling tcp_mss().
> > There may be another route lookup, and I want to keep that without
> > needless mutex.
> > 
> > ok?
> 
> You are right, lock is useless for the "bufsize < mss" branch. However,
> isn't it better to update and commit my `so_snd' unlocking diff instead
> of the only hunk?

I come from the other direction.  My goal is to set PR_MPSOCKET for
TCP.  And the diff I sent is the final chunk that is missing to
make parallel tcp_output() work.

As I see panics with the diff below, I would like to get the
tcp_mss_update() chunk in independently.  This makes it easier to
test my TCP PR_MPSOCKET diff and your TCP SB_MTXLOCK diff.  My diff
is stable, yours not.  Maybe we should review and commit the diff
below in smaller chunks to understand easier why my test machine
panics.

bluhm

> Index: sys/kern/uipc_socket.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/uipc_socket.c,v
> diff -u -p -r1.347 uipc_socket.c
> --- sys/kern/uipc_socket.c	19 Dec 2024 22:11:35 -0000	1.347
> +++ sys/kern/uipc_socket.c	24 Dec 2024 16:22:36 -0000
> @@ -153,27 +153,8 @@ soalloc(const struct protosw *prp, int w
>  	TAILQ_INIT(&so->so_q0);
>  	TAILQ_INIT(&so->so_q);
>  
> -	switch (dp->dom_family) {
> -	case AF_INET:
> -	case AF_INET6:
> -		switch (prp->pr_type) {
> -		case SOCK_RAW:
> -		case SOCK_DGRAM:
> -			so->so_snd.sb_flags |= SB_MTXLOCK;
> -			/* FALLTHROUGH */
> -		case SOCK_STREAM:
> -			so->so_rcv.sb_flags |= SB_MTXLOCK;
> -			break;
> -		}
> -		break;
> -	case AF_KEY:
> -	case AF_ROUTE:
> -	case AF_UNIX:
> -	case AF_FRAME:
> -		so->so_snd.sb_flags |= SB_MTXLOCK;
> -		so->so_rcv.sb_flags |= SB_MTXLOCK;
> -		break;
> -	}
> +	so->so_snd.sb_flags |= SB_MTXLOCK;
> +	so->so_rcv.sb_flags |= SB_MTXLOCK;
>  
>  	return (so);
>  }
> @@ -327,17 +308,13 @@ sofree(struct socket *so, int keep_lock)
>  			sounlock(head);
>  	}
>  
> -	switch (so->so_proto->pr_domain->dom_family) {
> -	case AF_INET:
> -	case AF_INET6:
> -		if (so->so_proto->pr_type == SOCK_STREAM)
> -			break;
> -		/* FALLTHROUGH */
> -	default:
> +	if (!keep_lock) {
> +		/*
> +		 * sofree() was called from soclose(). Sleep is safe
> +		 * even for tcp(4) sockets.
> +		 */
>  		sounlock(so);
>  		refcnt_finalize(&so->so_refcnt, "sofinal");
> -		solock(so);
> -		break;
>  	}
>  
>  	sigio_free(&so->so_sigio);
> @@ -358,9 +335,6 @@ sofree(struct socket *so, int keep_lock)
>  		(*so->so_proto->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
>  	m_purge(so->so_rcv.sb_mb);
>  
> -	if (!keep_lock)
> -		sounlock(so);
> -
>  #ifdef SOCKET_SPLICE
>  	if (so->so_sp) {
>  		/* Reuse splice idle, sounsplice() has been called before. */
> @@ -458,31 +432,6 @@ discard:
>  	if (so->so_sp) {
>  		struct socket *soback;
>  
> -		if (so->so_proto->pr_flags & PR_WANTRCVD) {
> -			/*
> -			 * Copy - Paste, but can't relock and sleep in
> -			 * sofree() in tcp(4) case. That's why tcp(4)
> -			 * still rely on solock() for splicing and
> -			 * unsplicing.
> -			 */
> -
> -			if (issplicedback(so)) {
> -				int freeing = SOSP_FREEING_WRITE;
> -
> -				if (so->so_sp->ssp_soback == so)
> -					freeing |= SOSP_FREEING_READ;
> -				sounsplice(so->so_sp->ssp_soback, so, freeing);
> -			}
> -			if (isspliced(so)) {
> -				int freeing = SOSP_FREEING_READ;
> -
> -				if (so == so->so_sp->ssp_socket)
> -					freeing |= SOSP_FREEING_WRITE;
> -				sounsplice(so, so->so_sp->ssp_socket, freeing);
> -			}
> -			goto free;
> -		}
> -
>  		sounlock(so);
>  		mtx_enter(&so->so_snd.sb_mtx);
>  		/*
> @@ -530,7 +479,6 @@ notsplicedback:
>  
>  		solock(so);
>  	}
> -free:
>  #endif /* SOCKET_SPLICE */
>  	/* sofree() calls sounlock(). */
>  	sofree(so, 0);
> @@ -1587,17 +1535,22 @@ sotask(void *arg)
>  	 */
>  
>  	sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
> -	if (sockstream)
> -		solock(so);
> -
>  	if (so->so_rcv.sb_flags & SB_SPLICE) {
> -		if (sockstream)
> +		struct socket *sosp = so->so_sp->ssp_socket;
> +
> +		if (sockstream) {
> +			sblock(&sosp->so_snd, SBL_WAIT | SBL_NOINTR);
> +			solock(so);
>  			doyield = 1;
> +		}
> +
>  		somove(so, M_DONTWAIT);
> -	}
>  
> -	if (sockstream)
> -		sounlock(so);
> +		if (sockstream) {
> +			sounlock(so);
> +			sbunlock(&sosp->so_snd);
> +		}
> +	}
>  	sbunlock(&so->so_rcv);
>  
>  	if (doyield) {
> Index: sys/netinet/tcp_input.c
> ===================================================================
> RCS file: /cvs/src/sys/netinet/tcp_input.c,v
> diff -u -p -r1.411 tcp_input.c
> --- sys/netinet/tcp_input.c	24 Dec 2024 12:22:17 -0000	1.411
> +++ sys/netinet/tcp_input.c	24 Dec 2024 16:22:36 -0000
> @@ -957,7 +957,10 @@ findpcb:
>  				    acked);
>  				tp->t_rcvacktime = now;
>  				ND6_HINT(tp);
> +
> +				mtx_enter(&so->so_snd.sb_mtx);
>  				sbdrop(so, &so->so_snd, acked);
> +				mtx_leave(&so->so_snd.sb_mtx);
>  
>  				/*
>  				 * If we had a pending ICMP message that
> @@ -1738,10 +1741,14 @@ trimthenstep6:
>  				tp->snd_wnd -= so->so_snd.sb_cc;
>  			else
>  				tp->snd_wnd = 0;
> +			mtx_enter(&so->so_snd.sb_mtx);
>  			sbdrop(so, &so->so_snd, (int)so->so_snd.sb_cc);
> +			mtx_leave(&so->so_snd.sb_mtx);
>  			ourfinisacked = 1;
>  		} else {
> +			mtx_enter(&so->so_snd.sb_mtx);
>  			sbdrop(so, &so->so_snd, acked);
> +			mtx_leave(&so->so_snd.sb_mtx);
>  			if (tp->snd_wnd > acked)
>  				tp->snd_wnd -= acked;
>  			else
> @@ -2995,8 +3002,10 @@ tcp_mss_update(struct tcpcb *tp)
>  	if (rt == NULL)
>  		return;
>  
> +	mtx_enter(&so->so_snd.sb_mtx);
>  	bufsize = so->so_snd.sb_hiwat;
>  	if (bufsize < mss) {
> +		mtx_leave(&so->so_snd.sb_mtx);
>  		mss = bufsize;
>  		/* Update t_maxseg and t_maxopd */
>  		tcp_mss(tp, mss);
> @@ -3005,6 +3014,7 @@ tcp_mss_update(struct tcpcb *tp)
>  		if (bufsize > sb_max)
>  			bufsize = sb_max;
>  		(void)sbreserve(so, &so->so_snd, bufsize);
> +		mtx_leave(&so->so_snd.sb_mtx);
>  	}
>  
>  	mtx_enter(&so->so_rcv.sb_mtx);
> Index: sys/netinet/tcp_output.c
> ===================================================================
> RCS file: /cvs/src/sys/netinet/tcp_output.c,v
> diff -u -p -r1.146 tcp_output.c
> --- sys/netinet/tcp_output.c	19 Dec 2024 22:11:35 -0000	1.146
> +++ sys/netinet/tcp_output.c	24 Dec 2024 16:22:36 -0000
> @@ -202,7 +202,7 @@ tcp_output(struct tcpcb *tp)
>  	u_int32_t optbuf[howmany(MAX_TCPOPTLEN, sizeof(u_int32_t))];
>  	u_char *opt = (u_char *)optbuf;
>  	unsigned int optlen, hdrlen, packetlen;
> -	int idle, sendalot = 0;
> +	int doing_sosend, idle, sendalot = 0;
>  	int i, sack_rxmit = 0;
>  	struct sackhole *p;
>  	uint64_t now;
> @@ -227,6 +227,10 @@ tcp_output(struct tcpcb *tp)
>  
>  	now = tcp_now();
>  
> +	mtx_enter(&so->so_snd.sb_mtx);
> +	doing_sosend=soissending(so);
> +	mtx_leave(&so->so_snd.sb_mtx);
> +
>  	/*
>  	 * Determine length of data that should be transmitted,
>  	 * and flags that will be used.
> @@ -243,7 +247,7 @@ tcp_output(struct tcpcb *tp)
>  		tp->snd_cwnd = 2 * tp->t_maxseg;
>  
>  	/* remember 'idle' for next invocation of tcp_output */
> -	if (idle && soissending(so)) {
> +	if (idle && doing_sosend) {
>  		tp->t_flags |= TF_LASTIDLE;
>  		idle = 0;
>  	} else
> @@ -392,7 +396,7 @@ again:
>  		if (len >= txmaxseg)
>  			goto send;
>  		if ((idle || (tp->t_flags & TF_NODELAY)) &&
> -		    len + off >= so->so_snd.sb_cc && !soissending(so) &&
> +		    len + off >= so->so_snd.sb_cc && !doing_sosend &&
>  		    (tp->t_flags & TF_NOPUSH) == 0)
>  			goto send;
>  		if (tp->t_force)
> @@ -725,7 +729,7 @@ send:
>  		 * give data to the user when a buffer fills or
>  		 * a PUSH comes in.)
>  		 */
> -		if (off + len == so->so_snd.sb_cc && !soissending(so))
> +		if (off + len == so->so_snd.sb_cc && !doing_sosend)
>  			flags |= TH_PUSH;
>  		tp->t_sndtime = now;
>  	} else {
> Index: sys/netinet/tcp_usrreq.c
> ===================================================================
> RCS file: /cvs/src/sys/netinet/tcp_usrreq.c,v
> diff -u -p -r1.233 tcp_usrreq.c
> --- sys/netinet/tcp_usrreq.c	19 Dec 2024 22:11:35 -0000	1.233
> +++ sys/netinet/tcp_usrreq.c	24 Dec 2024 16:22:36 -0000
> @@ -302,10 +302,12 @@ tcp_fill_info(struct tcpcb *tp, struct s
>  	ti->tcpi_so_rcv_sb_lowat = so->so_rcv.sb_lowat;
>  	ti->tcpi_so_rcv_sb_wat = so->so_rcv.sb_wat;
>  	mtx_leave(&so->so_rcv.sb_mtx);
> +	mtx_enter(&so->so_snd.sb_mtx);
>  	ti->tcpi_so_snd_sb_cc = so->so_snd.sb_cc;
>  	ti->tcpi_so_snd_sb_hiwat = so->so_snd.sb_hiwat;
>  	ti->tcpi_so_snd_sb_lowat = so->so_snd.sb_lowat;
>  	ti->tcpi_so_snd_sb_wat = so->so_snd.sb_wat;
> +	mtx_leave(&so->so_snd.sb_mtx);
>  
>  	return 0;
>  }
> @@ -842,7 +844,9 @@ tcp_send(struct socket *so, struct mbuf 
>  	if (so->so_options & SO_DEBUG)
>  		ostate = tp->t_state;
>  
> +	mtx_enter(&so->so_snd.sb_mtx);
>  	sbappendstream(so, &so->so_snd, m);
> +	mtx_leave(&so->so_snd.sb_mtx);
>  	m = NULL;
>  
>  	error = tcp_output(tp);
> @@ -895,7 +899,9 @@ tcp_sense(struct socket *so, struct stat
>  	if ((error = tcp_sogetpcb(so, &inp, &tp)))
>  		return (error);
>  
> +	mtx_enter(&so->so_snd.sb_mtx);
>  	ub->st_blksize = so->so_snd.sb_hiwat;
> +	mtx_leave(&so->so_snd.sb_mtx);
>  
>  	if (so->so_options & SO_DEBUG)
>  		tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_SENSE, 0);
> @@ -970,7 +976,9 @@ tcp_sendoob(struct socket *so, struct mb
>  	 * of data past the urgent section.
>  	 * Otherwise, snd_up should be one lower.
>  	 */
> +	mtx_enter(&so->so_snd.sb_mtx);
>  	sbappendstream(so, &so->so_snd, m);
> +	mtx_leave(&so->so_snd.sb_mtx);
>  	m = NULL;
>  	tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
>  	tp->t_force = 1;
> @@ -1519,7 +1527,11 @@ void
>  tcp_update_sndspace(struct tcpcb *tp)
>  {
>  	struct socket *so = tp->t_inpcb->inp_socket;
> -	u_long nmax = so->so_snd.sb_hiwat;
> +	u_long nmax;
> +
> +	mtx_enter(&so->so_snd.sb_mtx);
> +
> +	nmax = so->so_snd.sb_hiwat;
>  
>  	if (sbchecklowmem()) {
>  		/* low on memory try to get rid of some */
> @@ -1535,7 +1547,7 @@ tcp_update_sndspace(struct tcpcb *tp)
>  	}
>  
>  	/* a writable socket must be preserved because of poll(2) semantics */
> -	if (sbspace(so, &so->so_snd) >= so->so_snd.sb_lowat) {
> +	if (sbspace_locked(so, &so->so_snd) >= so->so_snd.sb_lowat) {
>  		if (nmax < so->so_snd.sb_cc + so->so_snd.sb_lowat)
>  			nmax = so->so_snd.sb_cc + so->so_snd.sb_lowat;
>  		/* keep in sync with sbreserve() calculation */
> @@ -1548,6 +1560,8 @@ tcp_update_sndspace(struct tcpcb *tp)
>  
>  	if (nmax != so->so_snd.sb_hiwat)
>  		sbreserve(so, &so->so_snd, nmax);
> +
> +	mtx_leave(&so->so_snd.sb_mtx);
>  }
>  
>  /*
> @@ -1581,9 +1595,11 @@ tcp_update_rcvspace(struct tcpcb *tp)
>  	}
>  
>  	/* a readable socket must be preserved because of poll(2) semantics */
> +	mtx_enter(&so->so_snd.sb_mtx);
>  	if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat &&
>  	    nmax < so->so_snd.sb_lowat)
>  		nmax = so->so_snd.sb_lowat;
> +	mtx_leave(&so->so_snd.sb_mtx);
>  
>  	if (nmax != so->so_rcv.sb_hiwat) {
>  		/* round to MSS boundary */