Index | Thread | Search

From:
Alexander Bluhm <bluhm@openbsd.org>
Subject:
Re: Unlock udp(4) somove()
To:
Vitaliy Makkoveev <mvs@openbsd.org>
Cc:
tech@openbsd.org
Date:
Sat, 20 Jul 2024 13:27:37 +0200

Download raw body.

Thread
On Fri, Jul 19, 2024 at 09:47:34PM +0300, Vitaliy Makkoveev wrote:
> Understood. This one releases `sb_mtx' mutexes only around pru_rcvd(). 

Some comments

> Index: sys/kern/uipc_socket.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/uipc_socket.c,v
> diff -u -p -r1.338 uipc_socket.c
> --- sys/kern/uipc_socket.c	14 Jul 2024 15:42:23 -0000	1.338
> +++ sys/kern/uipc_socket.c	19 Jul 2024 18:42:26 -0000
> @@ -324,31 +324,22 @@ sofree(struct socket *so, int keep_lock)
>  			sounlock(head);
>  	}
>  
> -	if (persocket) {
> +	switch (so->so_proto->pr_domain->dom_family) {
> +	case AF_INET:
> +	case AF_INET6:
> +		if (so->so_proto->pr_type == SOCK_STREAM)
> +			break;
> +		/* FALLTHROUGH */
> +	default:
>  		sounlock(so);
>  		refcnt_finalize(&so->so_refcnt, "sofinal");
>  		solock(so);
> +		break;
>  	}
>  
>  	sigio_free(&so->so_sigio);
>  	klist_free(&so->so_rcv.sb_klist);
>  	klist_free(&so->so_snd.sb_klist);
> -#ifdef SOCKET_SPLICE
> -	if (issplicedback(so)) {
> -		int freeing = SOSP_FREEING_WRITE;
> -
> -		if (so->so_sp->ssp_soback == so)
> -			freeing |= SOSP_FREEING_READ;
> -		sounsplice(so->so_sp->ssp_soback, so, freeing);
> -	}
> -	if (isspliced(so)) {
> -		int freeing = SOSP_FREEING_READ;
> -
> -		if (so == so->so_sp->ssp_socket)
> -			freeing |= SOSP_FREEING_WRITE;
> -		sounsplice(so, so->so_sp->ssp_socket, freeing);
> -	}
> -#endif /* SOCKET_SPLICE */
>  
>  	mtx_enter(&so->so_snd.sb_mtx);
>  	sbrelease(so, &so->so_snd);
> @@ -458,6 +449,83 @@ discard:
>  	if (so->so_state & SS_NOFDREF)
>  		panic("soclose NOFDREF: so %p, so_type %d", so, so->so_type);
>  	so->so_state |= SS_NOFDREF;
> +
> +#ifdef SOCKET_SPLICE
> +	if (so->so_sp) {
> +		struct socket *soback;
> +
> +		if (so->so_proto->pr_flags & PR_WANTRCVD) {
> +			/*
> +			 * Copy - Paste, but can't relock an sleep in

typo: a sleep

> +			 * sofree() in tcp(4) case. That's why tcp(4)
> +			 * still rely on solock() for splicing and
> +			 * unsplicing.
> +			 */
> +
> +			if (issplicedback(so)) {
> +				int freeing = SOSP_FREEING_WRITE;
> +
> +				if (so->so_sp->ssp_soback == so)
> +					freeing |= SOSP_FREEING_READ;
> +				sounsplice(so->so_sp->ssp_soback, so, freeing);
> +			}
> +			if (isspliced(so)) {
> +				int freeing = SOSP_FREEING_READ;
> +
> +				if (so == so->so_sp->ssp_socket)
> +					freeing |= SOSP_FREEING_WRITE;
> +				sounsplice(so, so->so_sp->ssp_socket, freeing);
> +			}
> +			goto free;
> +		}
> +
> +		sounlock(so);
> +		mtx_enter(&so->so_snd.sb_mtx);
> +		/*
> +		 * Concurrent sounsplice() locks `sb_mtx' mutextes on

typo: mutexes

> +		 * both `so_snd' and `so_rcv' before unsplice sockets.
> +		 */
> +		if ((soback = so->so_sp->ssp_soback) == NULL) {
> +			mtx_leave(&so->so_snd.sb_mtx);
> +			goto notsplicedback;
> +		}
> +		soref(soback);
> +		mtx_leave(&so->so_snd.sb_mtx);
> +
> +		sblock(&soback->so_rcv, SBL_WAIT | SBL_NOINTR);
> +		/*
> +		 * sblock() is always taken on `so_rcv' before call
> +		 * sounsplice(). `so' is dying and can be only unspliced.
> +		 */
> +		if (issplicedback(so)) {
> +			int freeing = SOSP_FREEING_WRITE;
> +
> +			if (so->so_sp->ssp_soback == so)
> +				freeing |= SOSP_FREEING_READ;
> +			solock(soback);
> +			sounsplice(so->so_sp->ssp_soback, so, freeing);
> +			sounlock(soback);
> +		}
> +		sbunlock(&soback->so_rcv);
> +		sorele(soback);

Here you mix soback and so->so_sp->ssp_soback.  Are they the same?
Between soback = so->so_sp->ssp_soback and unsplicing so->so_sp->ssp_soback
there is sounlock(so), mtx_leave(&so->so_snd.sb_mtx), and
sblock(&soback->so_rcv, SBL_WAIT | SBL_NOINTR) may sleep.

> +
> +notsplicedback:
> +		sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
> +		if (isspliced(so)) {
> +			int freeing = SOSP_FREEING_READ;
> +
> +			if (so == so->so_sp->ssp_socket)
> +				freeing |= SOSP_FREEING_WRITE;
> +			solock(so);
> +			sounsplice(so, so->so_sp->ssp_socket, freeing);
> +			sounlock(so);
> +		}
> +		sbunlock(&so->so_rcv);
> +
> +		solock(so);
> +	}
> +free:
> +#endif /* SOCKET_SPLICE */
>  	/* sofree() calls sounlock(). */
>  	sofree(so, 0);
>  	return (error);
> @@ -1411,14 +1479,6 @@ sosplice(struct socket *so, int fd, off_
>  		goto release;
>  	}
>  
> -	/* Splice so and sosp together. */
> -	mtx_enter(&so->so_rcv.sb_mtx);
> -	mtx_enter(&sosp->so_snd.sb_mtx);
> -	so->so_sp->ssp_socket = sosp;
> -	sosp->so_sp->ssp_soback = so;
> -	mtx_leave(&sosp->so_snd.sb_mtx);
> -	mtx_leave(&so->so_rcv.sb_mtx);
> -
>  	so->so_splicelen = 0;
>  	so->so_splicemax = max;
>  	if (tv)
> @@ -1429,9 +1489,20 @@ sosplice(struct socket *so, int fd, off_
>  	task_set(&so->so_splicetask, sotask, so);
>  
>  	/*
> -	 * To prevent softnet interrupt from calling somove() while
> -	 * we sleep, the socket buffers are not marked as spliced yet.
> +	 * To prevent sorwakeup() calling somove() before this somove()
> +	 * has finished, the socket buffers are not marked as spliced yet.
>  	 */
> +
> +	/* Splice so and sosp together. */
> +	mtx_enter(&so->so_rcv.sb_mtx);
> +	mtx_enter(&sosp->so_snd.sb_mtx);
> +	so->so_sp->ssp_socket = sosp;
> +	sosp->so_sp->ssp_soback = so;
> +	mtx_leave(&sosp->so_snd.sb_mtx);
> +	mtx_leave(&so->so_rcv.sb_mtx);
> +
> +	if ((so->so_proto->pr_flags & PR_WANTRCVD) == 0)
> +		sounlock(so);
>  	if (somove(so, M_WAIT)) {
>  		mtx_enter(&so->so_rcv.sb_mtx);
>  		mtx_enter(&sosp->so_snd.sb_mtx);
> @@ -1440,6 +1511,8 @@ sosplice(struct socket *so, int fd, off_
>  		mtx_leave(&sosp->so_snd.sb_mtx);
>  		mtx_leave(&so->so_rcv.sb_mtx);
>  	}
> +	if ((so->so_proto->pr_flags & PR_WANTRCVD) == 0)
> +		solock(so);
>  
>   release:
>  	sounlock(so);
> @@ -1454,6 +1527,8 @@ sosplice(struct socket *so, int fd, off_
>  void
>  sounsplice(struct socket *so, struct socket *sosp, int freeing)
>  {
> +	if ((so->so_proto->pr_flags & PR_WANTRCVD) == 0)
> +		sbassertlocked(&so->so_rcv);
>  	soassertlocked(so);
>  
>  	task_del(sosplice_taskq, &so->so_splicetask);
> @@ -1479,32 +1554,51 @@ soidle(void *arg)
>  {
>  	struct socket *so = arg;
>  
> +	sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
>  	solock(so);
> +	/*
> +	 * Depending on socket type, sblock(&so->so_rcv) or solock()
> +	 * is always held while modifying SB_SPLICE and
> +	 * so->so_sp->ssp_socket.
> +	 */
>  	if (so->so_rcv.sb_flags & SB_SPLICE) {
>  		so->so_error = ETIMEDOUT;
>  		sounsplice(so, so->so_sp->ssp_socket, 0);
>  	}
>  	sounlock(so);
> +	sbunlock(&so->so_rcv);
>  }
>  
>  void
>  sotask(void *arg)
>  {
>  	struct socket *so = arg;
> +	int doyield = 0;
> +	int sockstream = (so->so_proto->pr_flags & PR_WANTRCVD);
> +
> +	/*
> +	 * sblock() on `so_rcv' protects sockets from beind unspliced
> +	 * for UDP case. TCP sockets still rely on solock(). 
> +	 */
> +
> +	sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
> +	if (sockstream)
> +		solock(so);
>  
> -	solock(so);
>  	if (so->so_rcv.sb_flags & SB_SPLICE) {
> -		/*
> -		 * We may not sleep here as sofree() and unsplice() may be
> -		 * called from softnet interrupt context.  This would remove
> -		 * the socket during somove().
> -		 */
> +		if (sockstream)
> +			doyield = 1;
>  		somove(so, M_DONTWAIT);
>  	}
> -	sounlock(so);
>  
> -	/* Avoid user land starvation. */
> -	yield();
> +	if (sockstream)
> +		sounlock(so);
> +	sbunlock(&so->so_rcv);
> +
> +	if (doyield) {
> +		/* Avoid user land starvation. */
> +		yield();
> +	}
>  }
>  
>  /*
> @@ -1546,25 +1640,30 @@ somove(struct socket *so, int wait)
>  	struct mbuf	*m, **mp, *nextrecord;
>  	u_long		 len, off, oobmark;
>  	long		 space;
> -	int		 error = 0, maxreached = 0;
> +	int		 error = 0, maxreached = 0, unsplice = 0;
>  	unsigned int	 rcvstate;
> +	int		 sockdgram = ((so->so_proto->pr_flags &
> +			     PR_WANTRCVD) == 0);
>  
> -	soassertlocked(so);
> +	if (sockdgram)
> +		sbassertlocked(&so->so_rcv);
> +	else
> +		soassertlocked(so);
> +
> +	mtx_enter(&so->so_rcv.sb_mtx);
> +	mtx_enter(&sosp->so_snd.sb_mtx);
>  
>   nextpkt:
> -	if (so->so_error) {
> -		error = so->so_error;
> +	if ((error = READ_ONCE(so->so_error)))
>  		goto release;
> -	}
>  	if (sosp->so_snd.sb_state & SS_CANTSENDMORE) {
>  		error = EPIPE;
>  		goto release;
>  	}
> -	if (sosp->so_error && sosp->so_error != ETIMEDOUT &&
> -	    sosp->so_error != EFBIG && sosp->so_error != ELOOP) {
> -		error = sosp->so_error;
> +
> +	error = READ_ONCE(sosp->so_error);
> +	if (error && error != ETIMEDOUT && error != EFBIG && error != ELOOP)
>  		goto release;
> -	}

error may be != 0 here.  There are some goto release below that do
not set error.  You sould set error = 0 here.

>  	if ((sosp->so_state & SS_ISCONNECTED) == 0)
>  		goto release;
>  
> @@ -1577,26 +1676,22 @@ somove(struct socket *so, int wait)
>  			maxreached = 1;
>  		}
>  	}
> -	mtx_enter(&sosp->so_snd.sb_mtx);
>  	space = sbspace_locked(sosp, &sosp->so_snd);
>  	if (so->so_oobmark && so->so_oobmark < len &&
>  	    so->so_oobmark < space + 1024)
>  		space += 1024;
>  	if (space <= 0) {
> -		mtx_leave(&sosp->so_snd.sb_mtx);
>  		maxreached = 0;
>  		goto release;
>  	}
>  	if (space < len) {
>  		maxreached = 0;
>  		if (space < sosp->so_snd.sb_lowat) {
> -			mtx_leave(&sosp->so_snd.sb_mtx);
>  			goto release;
>  		}

You may remove the { }

>  		len = space;
>  	}
>  	sosp->so_snd.sb_state |= SS_ISSENDING;
> -	mtx_leave(&sosp->so_snd.sb_mtx);
>  
>  	SBLASTRECORDCHK(&so->so_rcv, "somove 1");
>  	SBLASTMBUFCHK(&so->so_rcv, "somove 1");
> @@ -1618,8 +1713,13 @@ somove(struct socket *so, int wait)
>  		m = m->m_next;
>  	if (m == NULL) {
>  		sbdroprecord(so, &so->so_rcv);
> -		if (so->so_proto->pr_flags & PR_WANTRCVD)
> +		if (so->so_proto->pr_flags & PR_WANTRCVD) {
> +			mtx_leave(&sosp->so_snd.sb_mtx);
> +			mtx_leave(&so->so_rcv.sb_mtx);
>  			pru_rcvd(so);
> +			mtx_enter(&so->so_rcv.sb_mtx);
> +			mtx_enter(&sosp->so_snd.sb_mtx);
> +		}
>  		goto nextpkt;
>  	}
>  
> @@ -1724,11 +1824,15 @@ somove(struct socket *so, int wait)
>  	}
>  
>  	/* Send window update to source peer as receive buffer has changed. */
> -	if (so->so_proto->pr_flags & PR_WANTRCVD)
> +	if (so->so_proto->pr_flags & PR_WANTRCVD) {
> +		mtx_leave(&sosp->so_snd.sb_mtx);
> +		mtx_leave(&so->so_rcv.sb_mtx);
>  		pru_rcvd(so);
> +		mtx_enter(&so->so_rcv.sb_mtx);
> +		mtx_enter(&sosp->so_snd.sb_mtx);
> +	}
>  
>  	/* Receive buffer did shrink by len bytes, adjust oob. */
> -	mtx_enter(&so->so_rcv.sb_mtx);
>  	rcvstate = so->so_rcv.sb_state;
>  	so->so_rcv.sb_state &= ~SS_RCVATMARK;
>  	oobmark = so->so_oobmark;
> @@ -1739,7 +1843,6 @@ somove(struct socket *so, int wait)
>  		if (oobmark >= len)
>  			oobmark = 0;
>  	}
> -	mtx_leave(&so->so_rcv.sb_mtx);
>  
>  	/*
>  	 * Handle oob data.  If any malloc fails, ignore error.
> @@ -1755,7 +1858,12 @@ somove(struct socket *so, int wait)
>  		} else if (oobmark) {
>  			o = m_split(m, oobmark, wait);
>  			if (o) {
> +				mtx_leave(&sosp->so_snd.sb_mtx);
> +				mtx_leave(&so->so_rcv.sb_mtx);
>  				error = pru_send(sosp, m, NULL, NULL);
> +				mtx_enter(&so->so_rcv.sb_mtx);
> +				mtx_enter(&sosp->so_snd.sb_mtx);
> +
>  				if (error) {
>  					if (sosp->so_snd.sb_state &
>  					    SS_CANTSENDMORE)
> @@ -1773,7 +1881,13 @@ somove(struct socket *so, int wait)
>  		if (o) {
>  			o->m_len = 1;
>  			*mtod(o, caddr_t) = *mtod(m, caddr_t);
> +		
> +			mtx_leave(&sosp->so_snd.sb_mtx);
> +			mtx_leave(&so->so_rcv.sb_mtx);
>  			error = pru_sendoob(sosp, o, NULL, NULL);
> +			mtx_enter(&so->so_rcv.sb_mtx);
> +			mtx_enter(&sosp->so_snd.sb_mtx);
> +
>  			if (error) {
>  				if (sosp->so_snd.sb_state & SS_CANTSENDMORE)
>  					error = EPIPE;
> @@ -1791,15 +1905,24 @@ somove(struct socket *so, int wait)
>  		}
>  	}
>  
> -	mtx_enter(&sosp->so_snd.sb_mtx);
>  	/* Append all remaining data to drain socket. */
>  	if (so->so_rcv.sb_cc == 0 || maxreached)
>  		sosp->so_snd.sb_state &= ~SS_ISSENDING;
> +
>  	mtx_leave(&sosp->so_snd.sb_mtx);
> +	mtx_leave(&so->so_rcv.sb_mtx);
>  
> +	if (sockdgram)
> +		solock_shared(sosp);
>  	error = pru_send(sosp, m, NULL, NULL);
> +	if (sockdgram)
> +		sounlock_shared(sosp);
> +
> +	mtx_enter(&so->so_rcv.sb_mtx);
> +	mtx_enter(&sosp->so_snd.sb_mtx);
> +	
>  	if (error) {
> -		if (sosp->so_snd.sb_state & SS_CANTSENDMORE)
> +		if (sosp->so_snd.sb_state & SS_CANTSENDMORE) 
>  			error = EPIPE;
>  		goto release;
>  	}
> @@ -1810,26 +1933,35 @@ somove(struct socket *so, int wait)
>  		goto nextpkt;
>  
>   release:
> -	mtx_enter(&sosp->so_snd.sb_mtx);
>  	sosp->so_snd.sb_state &= ~SS_ISSENDING;
> -	mtx_leave(&sosp->so_snd.sb_mtx);
>  
>  	if (!error && maxreached && so->so_splicemax == so->so_splicelen)
>  		error = EFBIG;
>  	if (error)
> -		so->so_error = error;
> +		WRITE_ONCE(so->so_error, error);
> +
>  	if (((so->so_rcv.sb_state & SS_CANTRCVMORE) &&
>  	    so->so_rcv.sb_cc == 0) ||
>  	    (sosp->so_snd.sb_state & SS_CANTSENDMORE) ||
> -	    maxreached || error) {
> +	    maxreached || error)
> +		unsplice = 1;
> +
> +	mtx_leave(&sosp->so_snd.sb_mtx);
> +	mtx_leave(&so->so_rcv.sb_mtx);
> +
> +	if (unsplice) {
> +		if (sockdgram)
> +			solock(so);
>  		sounsplice(so, sosp, 0);
> +		if (sockdgram)
> +			sounlock(so);
> +
>  		return (0);
>  	}
>  	if (timerisset(&so->so_idletv))
>  		timeout_add_tv(&so->so_idleto, &so->so_idletv);
>  	return (1);
>  }
> -
>  #endif /* SOCKET_SPLICE */
>  
>  void
> @@ -1839,22 +1971,16 @@ sorwakeup(struct socket *so)
>  		soassertlocked_readonly(so);
>  
>  #ifdef SOCKET_SPLICE
> -	if (so->so_rcv.sb_flags & SB_SPLICE) {
> -		/*
> -		 * TCP has a sendbuffer that can handle multiple packets
> -		 * at once.  So queue the stream a bit to accumulate data.
> -		 * The sosplice thread will call somove() later and send
> -		 * the packets calling tcp_output() only once.
> -		 * In the UDP case, send out the packets immediately.
> -		 * Using a thread would make things slower.
> -		 */
> -		if (so->so_proto->pr_flags & PR_WANTRCVD)
> +	if (so->so_proto->pr_flags & PR_SPLICE) {
> +		sb_mtx_lock(&so->so_rcv);
> +		if (so->so_rcv.sb_flags & SB_SPLICE)
>  			task_add(sosplice_taskq, &so->so_splicetask);
> -		else
> -			somove(so, M_DONTWAIT);
> +		if (isspliced(so)) {
> +			sb_mtx_unlock(&so->so_rcv);
> +			return;
> +		}
> +		sb_mtx_unlock(&so->so_rcv);
>  	}
> -	if (isspliced(so))
> -		return;
>  #endif
>  	sowakeup(so, &so->so_rcv);
>  	if (so->so_upcall)
> @@ -1868,10 +1994,17 @@ sowwakeup(struct socket *so)
>  		soassertlocked_readonly(so);
>  
>  #ifdef SOCKET_SPLICE
> -	if (so->so_snd.sb_flags & SB_SPLICE)
> -		task_add(sosplice_taskq, &so->so_sp->ssp_soback->so_splicetask);
> -	if (issplicedback(so))
> -		return;
> +	if (so->so_proto->pr_flags & PR_SPLICE) {
> +		sb_mtx_lock(&so->so_snd);
> +		if (so->so_snd.sb_flags & SB_SPLICE)
> +			task_add(sosplice_taskq,
> +			    &so->so_sp->ssp_soback->so_splicetask);
> +		if (issplicedback(so)) {
> +			sb_mtx_unlock(&so->so_snd);
> +			return;
> +		}

I think the issplicedback(so) was outside of the if (so->so_proto->pr_flags
& PR_SPLICE) deliberately.

If somove() during soplice() sleeps, issplicedback(so) is set, but
(so->so_proto->pr_flags & PR_SPLICE) is not.  We do not schedule a
task as somove() will do its job after sleep.  But we do not want
to wakeup the socket for writing.

> +		sb_mtx_unlock(&so->so_snd);
> +	}
>  #endif
>  	sowakeup(so, &so->so_snd);
>  }
> Index: sys/netinet/udp_usrreq.c
> ===================================================================
> RCS file: /cvs/src/sys/netinet/udp_usrreq.c,v
> diff -u -p -r1.322 udp_usrreq.c
> --- sys/netinet/udp_usrreq.c	19 Jul 2024 15:41:58 -0000	1.322
> +++ sys/netinet/udp_usrreq.c	19 Jul 2024 18:42:26 -0000
> @@ -1209,6 +1209,11 @@ udp_send(struct socket *so, struct mbuf 
>  
>  	soassertlocked_readonly(so);
>  
> +	if (inp == NULL) {
> +		/* PCB could be destroyed, but socket still spliced. */
> +		return (EINVAL);
> +	}
> +
>  #ifdef PIPEX
>  	if (inp->inp_pipex) {
>  		struct pipex_session *session;
> Index: sys/sys/socketvar.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/socketvar.h,v
> diff -u -p -r1.132 socketvar.h
> --- sys/sys/socketvar.h	12 Jul 2024 17:20:18 -0000	1.132
> +++ sys/sys/socketvar.h	19 Jul 2024 18:42:26 -0000
> @@ -330,6 +330,12 @@ int sblock(struct sockbuf *, int);
>  /* release lock on sockbuf sb */
>  void sbunlock(struct sockbuf *);
>  
> +static inline void
> +sbassertlocked(struct sockbuf *sb)
> +{
> +	rw_assert_wrlock(&sb->sb_lock);
> +}
> +
>  #define	SB_EMPTY_FIXUP(sb) do {						\
>  	if ((sb)->sb_mb == NULL) {					\
>  		(sb)->sb_mbtail = NULL;					\

You have 3 trailing whitespaces in the diff.

bluhm