From: Alexander Bluhm Subject: Re: Don't take solock() in soreceive() for SOCK_RAW inet sockets To: Vitaliy Makkoveev Cc: tech@openbsd.org Date: Mon, 15 Apr 2024 20:52:44 +0200 On Thu, Apr 11, 2024 at 10:15:43PM +0300, Vitaliy Makkoveev wrote: > On Thu, Apr 11, 2024 at 03:07:55PM +0200, Alexander Bluhm wrote: > > On Wed, Apr 10, 2024 at 10:29:57PM +0300, Vitaliy Makkoveev wrote: > > > Really, this big diff will be the hellish one to commit and receive > > > possible fallout due to very huge area. > > > > Better small progress than huge commits. > > > > > I see raw sockets the best way to start, because they don't need to call > > > > Advantage to proceed with raw sockets is that they are simple and > > have no socket splicing. But the test coverage is low. > > > > > The tcp and udp sockets require > > > to start sosplice() and somove() reworking, so I want to do this the > > > last. > > > > To get rid of MP races we need parallel stress tests. With UDP > > this is easier than raw IP. A while ago I used the diff below to > > run UDP input with shared netlock. But it does not work correctly > > with somove(). > > > > To put parallel pressure onto the IP protocol and socket layer we > > either have to make somove() MP safe for UDP, or adapt the diff > > below for raw IP and write a bunch of tests. > > > > With parallel UDP we also get a lot of testing by snapshot users. > > > > This diff triggers soassertlocked() assertion in udp_input(). Seems it > should be soassertlocked_readonly() because you don't need to grab > `so_lock'. I tested this diff with this standalone sblock() for udp(4) > sockets diff, looks stable. > > The main problem for udp(4) sockets is the splicing ability. However, we can > hold `sb_mtx' around `ssp_socket' modifications together with solock(). > So the `sb_mtx' will be enough for the isspliced() check in > soreceive(). 
The unlocked `so_sp' dereference is fine, because we set it > only once for the whole socket life-time and we do this before > `ssp_socket' assignment. > > The sosplice() was reworked to accept standalone sblock() for udp(4) > sockets. > > We also need to take sblock() before splicing sockets, so the sosplice() > and soreceive() are both serialized. Since `sb_mtx' is required to unsplice > sockets too, it also serializes somove() with soreceive() regardless of > the somove() caller. > > soreceive() performs unlocked `so_error' check and modification. I don't > see a problem here. Previously, you had no ability to predict which > concurrent soreceive() or sosend() thread will fail and clean > `so_error'. With this unlocked access we could have sosend() and > soreceive() threads which fail together. > > `so_error' is stored to the local `error2' variable because `so_error' could be > overwritten by a concurrent sosend() thread. Passes regress with witness. Running UDP in parallel on top still has assertion warnings. I think it is a step in the right direction. 
OK bluhm@ > Index: sys/kern/uipc_socket.c > =================================================================== > RCS file: /cvs/src/sys/kern/uipc_socket.c,v > retrieving revision 1.329 > diff -u -p -r1.329 uipc_socket.c > --- sys/kern/uipc_socket.c 11 Apr 2024 13:32:51 -0000 1.329 > +++ sys/kern/uipc_socket.c 11 Apr 2024 17:22:39 -0000 > @@ -159,8 +159,6 @@ soalloc(const struct protosw *prp, int w > case AF_INET6: > switch (prp->pr_type) { > case SOCK_DGRAM: > - so->so_rcv.sb_flags |= SB_MTXLOCK; > - break; > case SOCK_RAW: > so->so_rcv.sb_flags |= SB_MTXLOCK | SB_OWNLOCK; > break; > @@ -819,7 +817,7 @@ soreceive(struct socket *so, struct mbuf > struct mbuf *m, **mp; > struct mbuf *cm; > u_long len, offset, moff; > - int flags, error, type, uio_error = 0; > + int flags, error, error2, type, uio_error = 0; > const struct protosw *pr = so->so_proto; > struct mbuf *nextrecord; > size_t resid, orig_resid = uio->uio_resid; > @@ -889,10 +887,10 @@ restart: > panic("receive 1: so %p, so_type %d, sb_cc %lu", > so, so->so_type, so->so_rcv.sb_cc); > #endif > - if (so->so_error) { > + if ((error2 = READ_ONCE(so->so_error))) { > if (m) > goto dontblock; > - error = so->so_error; > + error = error2; > if ((flags & MSG_PEEK) == 0) > so->so_error = 0; > goto release; > @@ -1289,7 +1287,13 @@ sorflush_locked(struct socket *so) > error = sblock(so, sb, SBL_WAIT | SBL_NOINTR); > /* with SBL_WAIT and SLB_NOINTR sblock() must not fail */ > KASSERT(error == 0); > + > + if (sb->sb_flags & SB_OWNLOCK) > + solock(so); > socantrcvmore(so); > + if (sb->sb_flags & SB_OWNLOCK) > + sounlock(so); > + > mtx_enter(&sb->sb_mtx); > m = sb->sb_mb; > memset(&sb->sb_startzero, 0, > @@ -1323,13 +1327,17 @@ sorflush(struct socket *so) > int > sosplice(struct socket *so, int fd, off_t max, struct timeval *tv) > { > - struct file *fp; > + struct file *fp = NULL; > struct socket *sosp; > - struct sosplice *sp; > struct taskq *tq; > int error = 0; > > - soassertlocked(so); > + if ((so->so_proto->pr_flags & 
PR_SPLICE) == 0) > + return (EPROTONOSUPPORT); > + if (max && max < 0) > + return (EINVAL); > + if (tv && (tv->tv_sec < 0 || !timerisvalid(tv))) > + return (EINVAL); > > if (sosplice_taskq == NULL) { > rw_enter_write(&sosplice_lock); > @@ -1350,63 +1358,51 @@ sosplice(struct socket *so, int fd, off_ > membar_consumer(); > } > > - if ((so->so_proto->pr_flags & PR_SPLICE) == 0) > - return (EPROTONOSUPPORT); > - if (so->so_options & SO_ACCEPTCONN) > - return (EOPNOTSUPP); > + if (so->so_rcv.sb_flags & SB_OWNLOCK) { > + if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0) > + return (error); > + solock(so); > + } else { > + solock(so); > + if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0) { > + sounlock(so); > + return (error); > + } > + } > + > + if (so->so_options & SO_ACCEPTCONN) { > + error = EOPNOTSUPP; > + goto out; > + } > if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && > - (so->so_proto->pr_flags & PR_CONNREQUIRED)) > - return (ENOTCONN); > - if (so->so_sp == NULL) { > - sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO); > - if (so->so_sp == NULL) > - so->so_sp = sp; > - else > - pool_put(&sosplice_pool, sp); > + (so->so_proto->pr_flags & PR_CONNREQUIRED)) { > + error = ENOTCONN; > + goto out; > } > + if (so->so_sp == NULL) > + so->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO); > > /* If no fd is given, unsplice by removing existing link. */ > if (fd < 0) { > - /* Lock receive buffer. */ > - if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0) { > - return (error); > - } > if (so->so_sp->ssp_socket) > sounsplice(so, so->so_sp->ssp_socket, 0); > - sbunlock(so, &so->so_rcv); > - return (0); > + goto out; > } > > - if (max && max < 0) > - return (EINVAL); > - > - if (tv && (tv->tv_sec < 0 || !timerisvalid(tv))) > - return (EINVAL); > - > /* Find sosp, the drain socket where data will be spliced into. 
*/ > if ((error = getsock(curproc, fd, &fp)) != 0) > - return (error); > + goto out; > sosp = fp->f_data; > if (sosp->so_proto->pr_usrreqs->pru_send != > so->so_proto->pr_usrreqs->pru_send) { > error = EPROTONOSUPPORT; > - goto frele; > - } > - if (sosp->so_sp == NULL) { > - sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO); > - if (sosp->so_sp == NULL) > - sosp->so_sp = sp; > - else > - pool_put(&sosplice_pool, sp); > + goto out; > } > + if (sosp->so_sp == NULL) > + sosp->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO); > > - /* Lock both receive and send buffer. */ > - if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0) { > - goto frele; > - } > if ((error = sblock(so, &sosp->so_snd, SBL_WAIT)) != 0) { > - sbunlock(so, &so->so_rcv); > - goto frele; > + goto out; > } > > if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) { > @@ -1423,8 +1419,10 @@ sosplice(struct socket *so, int fd, off_ > } > > /* Splice so and sosp together. */ > + mtx_enter(&so->so_rcv.sb_mtx); > so->so_sp->ssp_socket = sosp; > sosp->so_sp->ssp_soback = so; > + mtx_leave(&so->so_rcv.sb_mtx); > so->so_splicelen = 0; > so->so_splicemax = max; > if (tv) > @@ -1447,17 +1445,18 @@ sosplice(struct socket *so, int fd, off_ > > release: > sbunlock(sosp, &sosp->so_snd); > - sbunlock(so, &so->so_rcv); > - frele: > - /* > - * FRELE() must not be called with the socket lock held. It is safe to > - * release the lock here as long as no other operation happen on the > - * socket when sosplice() returns. The dance could be avoided by > - * grabbing the socket lock inside this function. 
> - */ > - sounlock(so); > - FRELE(fp, curproc); > - solock(so); > + out: > + if (so->so_rcv.sb_flags & SB_OWNLOCK) { > + sounlock(so); > + sbunlock(so, &so->so_rcv); > + } else { > + sbunlock(so, &so->so_rcv); > + sounlock(so); > + } > + > + if (fp) > + FRELE(fp, curproc); > + > return (error); > } > > @@ -1469,10 +1468,12 @@ sounsplice(struct socket *so, struct soc > task_del(sosplice_taskq, &so->so_splicetask); > timeout_del(&so->so_idleto); > sosp->so_snd.sb_flags &= ~SB_SPLICE; > + > mtx_enter(&so->so_rcv.sb_mtx); > so->so_rcv.sb_flags &= ~SB_SPLICE; > - mtx_leave(&so->so_rcv.sb_mtx); > so->so_sp->ssp_socket = sosp->so_sp->ssp_soback = NULL; > + mtx_leave(&so->so_rcv.sb_mtx); > + > /* Do not wakeup a socket that is about to be freed. */ > if ((freeing & SOSP_FREEING_READ) == 0 && soreadable(so)) > sorwakeup(so); > @@ -2025,7 +2026,6 @@ sosetopt(struct socket *so, int level, i > break; > #ifdef SOCKET_SPLICE > case SO_SPLICE: > - solock(so); > if (m == NULL) { > error = sosplice(so, -1, 0, NULL); > } else if (m->m_len < sizeof(int)) { > @@ -2038,7 +2038,6 @@ sosetopt(struct socket *so, int level, i > mtod(m, struct splice *)->sp_max, > &mtod(m, struct splice *)->sp_idle); > } > - sounlock(so); > break; > #endif /* SOCKET_SPLICE */ >