
From: Alexander Bluhm <bluhm@openbsd.org>
Subject: Re: Don't take solock() in soreceive() for SOCK_RAW inet sockets
To: Vitaliy Makkoveev <mvs@openbsd.org>
Cc: tech@openbsd.org
Date: Mon, 15 Apr 2024 20:52:44 +0200

  • On Thu, Apr 11, 2024 at 10:15:43PM +0300, Vitaliy Makkoveev wrote:
    > On Thu, Apr 11, 2024 at 03:07:55PM +0200, Alexander Bluhm wrote:
    > > On Wed, Apr 10, 2024 at 10:29:57PM +0300, Vitaliy Makkoveev wrote:
    > > > Really, this big diff will be a hellish one to commit and could
    > > > receive fallout due to the very large area it touches.
    > > 
    > > Better small progress than huge commits.
    > > 
    > > > I see raw sockets as the best way to start, because they don't need to call
    > > 
    > > The advantage of proceeding with raw sockets is that they are simple
    > > and have no socket splicing.  But the test coverage is low.
    > > 
    > > > The tcp and udp sockets require
    > > > the sosplice() and somove() rework to be started first, so I want
    > > > to do them last.
    > > 
    > > To get rid of MP races we need parallel stress tests.  With UDP
    > > this is easier than with raw IP.  A while ago I used the diff below
    > > to run UDP input with shared netlock.  But it does not work
    > > correctly with somove().
    > > 
    > > To put parallel pressure onto the IP protocol and socket layer we
    > > either have to make somove() MP safe for UDP, or adapt the diff
    > > below for raw IP and write a bunch of tests.
    > > 
    > > With parallel UDP we also get a lot of testing by snapshot users.
    > > 
    > 
    > This diff triggers the soassertlocked() assertion in udp_input(). It
    > seems it should be soassertlocked_readonly(), because you don't need
    > to grab `so_lock'. I tested your diff together with the standalone
    > sblock() for udp(4) sockets diff below; it looks stable.
    > 
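    [Editor's note: a minimal sketch of the suggested assertion change,
    with hypothetical placement in the udp_input() receive path; per the
    reasoning above, the read-side variant does not require grabbing
    `so_lock'.]

	/* before: insists on the exclusive socket lock */
	soassertlocked(so);

	/* after: the read-side assertion is sufficient on this path */
	soassertlocked_readonly(so);
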
    > The main problem for udp(4) sockets is the splicing ability. However,
    > we can hold `sb_mtx' around `ssp_socket' modifications together with
    > solock(). So `sb_mtx' will be enough for the isspliced() check in
    > soreceive(). The unlocked `so_sp' dereference is fine, because we set
    > it only once for the whole socket lifetime, and we do this before the
    > `ssp_socket' assignment.
    > 
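    [Editor's note: a minimal sketch of the reasoning above, assuming the
    isspliced() macro from sys/socketvar.h (so_sp && so_sp->ssp_socket)
    and a hypothetical helper name; only `sb_mtx' is taken for the check.]

	/*
	 * Report whether the socket is spliced while holding only the
	 * receive buffer mutex.  The unlocked so_sp dereference is safe
	 * because so_sp is assigned once, before ssp_socket; sb_mtx
	 * protects the ssp_socket pointer itself.
	 */
	int
	so_rcv_isspliced(struct socket *so)
	{
		int spliced;

		mtx_enter(&so->so_rcv.sb_mtx);
		spliced = isspliced(so);
		mtx_leave(&so->so_rcv.sb_mtx);

		return (spliced);
	}
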
    > sosplice() was reworked to accept the standalone sblock() for udp(4)
    > sockets.
    > 
    > We also need to take sblock() before splicing sockets, so that
    > sosplice() and soreceive() are both serialized. Since `sb_mtx' is
    > required to unsplice sockets too, it also serializes somove() with
    > soreceive(), regardless of the somove() caller.
    > 
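    [Editor's note: a sketch, with the reasoning as comments, of the lock
    ordering the diff below introduces at the top of sosplice(); the
    SB_OWNLOCK case reflects the standalone sblock() described above.]

	if (so->so_rcv.sb_flags & SB_OWNLOCK) {
		/*
		 * The buffer lock no longer depends on the socket lock:
		 * take sblock() first, serializing with soreceive(),
		 * and only then grab the socket lock.
		 */
		if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0)
			return (error);
		solock(so);
	} else {
		/* sblock() still needs the socket lock: keep the old order. */
		solock(so);
		if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0) {
			sounlock(so);
			return (error);
		}
	}
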
    > soreceive() performs an unlocked `so_error' check and modification. I
    > don't see a problem here. Previously, you had no way to predict which
    > concurrent soreceive() or sosend() thread would fail and clear
    > `so_error'. With this unlocked access we could have sosend() and
    > soreceive() threads which fail together.
    > 
    > `so_error' is stored in the local `error2' variable because
    > `so_error' could be overwritten by a concurrent sosend() thread.
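    [Editor's note: an illustration of the race the local copy avoids.
    Without the socket lock held around this check, a second read of
    `so_error' could observe the clearing done by a concurrent sosend().]

	/*
	 * thread A: soreceive()               thread B: sosend()
	 * error2 = READ_ONCE(so->so_error);   so_error is, say, ECONNREFUSED
	 *                                     error = so->so_error;
	 *                                     so->so_error = 0;
	 * error = error2;                     still ECONNREFUSED
	 *
	 * Returning so->so_error here instead of error2 could yield 0 and
	 * hide the error from the reader.
	 */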
    
    Passes regress with witness.
    
    Running UDP in parallel on top of it still produces assertion warnings.
    I think it is a step in the right direction.
    
    OK bluhm@
    
    > Index: sys/kern/uipc_socket.c
    > ===================================================================
    > RCS file: /cvs/src/sys/kern/uipc_socket.c,v
    > retrieving revision 1.329
    > diff -u -p -r1.329 uipc_socket.c
    > --- sys/kern/uipc_socket.c	11 Apr 2024 13:32:51 -0000	1.329
    > +++ sys/kern/uipc_socket.c	11 Apr 2024 17:22:39 -0000
    > @@ -159,8 +159,6 @@ soalloc(const struct protosw *prp, int w
    >  	case AF_INET6:
    >  		switch (prp->pr_type) {
    >  		case SOCK_DGRAM:
    > -			so->so_rcv.sb_flags |= SB_MTXLOCK;
    > -			break;
    >  		case SOCK_RAW:
    >  			so->so_rcv.sb_flags |= SB_MTXLOCK | SB_OWNLOCK;
    >  			break;
    > @@ -819,7 +817,7 @@ soreceive(struct socket *so, struct mbuf
    >  	struct mbuf *m, **mp;
    >  	struct mbuf *cm;
    >  	u_long len, offset, moff;
    > -	int flags, error, type, uio_error = 0;
    > +	int flags, error, error2, type, uio_error = 0;
    >  	const struct protosw *pr = so->so_proto;
    >  	struct mbuf *nextrecord;
    >  	size_t resid, orig_resid = uio->uio_resid;
    > @@ -889,10 +887,10 @@ restart:
    >  			panic("receive 1: so %p, so_type %d, sb_cc %lu",
    >  			    so, so->so_type, so->so_rcv.sb_cc);
    >  #endif
    > -		if (so->so_error) {
    > +		if ((error2 = READ_ONCE(so->so_error))) {
    >  			if (m)
    >  				goto dontblock;
    > -			error = so->so_error;
    > +			error = error2;
    >  			if ((flags & MSG_PEEK) == 0)
    >  				so->so_error = 0;
    >  			goto release;
    > @@ -1289,7 +1287,13 @@ sorflush_locked(struct socket *so)
    >  	error = sblock(so, sb, SBL_WAIT | SBL_NOINTR);
    >  	/* with SBL_WAIT and SLB_NOINTR sblock() must not fail */
    >  	KASSERT(error == 0);
    > +
    > +	if (sb->sb_flags & SB_OWNLOCK)
    > +		solock(so);
    >  	socantrcvmore(so);
    > +	if (sb->sb_flags & SB_OWNLOCK)
    > +		sounlock(so);
    > +
    >  	mtx_enter(&sb->sb_mtx);
    >  	m = sb->sb_mb;
    >  	memset(&sb->sb_startzero, 0,
    > @@ -1323,13 +1327,17 @@ sorflush(struct socket *so)
    >  int
    >  sosplice(struct socket *so, int fd, off_t max, struct timeval *tv)
    >  {
    > -	struct file	*fp;
    > +	struct file	*fp = NULL;
    >  	struct socket	*sosp;
    > -	struct sosplice	*sp;
    >  	struct taskq	*tq;
    >  	int		 error = 0;
    >  
    > -	soassertlocked(so);
    > +	if ((so->so_proto->pr_flags & PR_SPLICE) == 0)
    > +		return (EPROTONOSUPPORT);
    > +	if (max && max < 0)
    > +		return (EINVAL);
    > +	if (tv && (tv->tv_sec < 0 || !timerisvalid(tv)))
    > +		return (EINVAL);
    >  
    >  	if (sosplice_taskq == NULL) {
    >  		rw_enter_write(&sosplice_lock);
    > @@ -1350,63 +1358,51 @@ sosplice(struct socket *so, int fd, off_
    >  		membar_consumer();
    >  	}
    >  
    > -	if ((so->so_proto->pr_flags & PR_SPLICE) == 0)
    > -		return (EPROTONOSUPPORT);
    > -	if (so->so_options & SO_ACCEPTCONN)
    > -		return (EOPNOTSUPP);
    > +	if (so->so_rcv.sb_flags & SB_OWNLOCK) {
    > +		if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0)
    > +			return (error);
    > +		solock(so);
    > +	} else {
    > +		solock(so);
    > +		if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0) {
    > +			sounlock(so);
    > +			return (error);
    > +		}
    > +	}
    > +
    > +	if (so->so_options & SO_ACCEPTCONN) {
    > +		error = EOPNOTSUPP;
    > +		goto out;
    > +	}
    >  	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
    > -	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
    > -		return (ENOTCONN);
    > -	if (so->so_sp == NULL) {
    > -		sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
    > -		if (so->so_sp == NULL)
    > -			so->so_sp = sp;
    > -		else
    > -			pool_put(&sosplice_pool, sp);
    > +	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
    > +		error = ENOTCONN;
    > +		goto out;
    >  	}
    > +	if (so->so_sp == NULL)
    > +		so->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
    >  
    >  	/* If no fd is given, unsplice by removing existing link. */
    >  	if (fd < 0) {
    > -		/* Lock receive buffer. */
    > -		if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0) {
    > -			return (error);
    > -		}
    >  		if (so->so_sp->ssp_socket)
    >  			sounsplice(so, so->so_sp->ssp_socket, 0);
    > -		sbunlock(so, &so->so_rcv);
    > -		return (0);
    > +		goto out;
    >  	}
    >  
    > -	if (max && max < 0)
    > -		return (EINVAL);
    > -
    > -	if (tv && (tv->tv_sec < 0 || !timerisvalid(tv)))
    > -		return (EINVAL);
    > -
    >  	/* Find sosp, the drain socket where data will be spliced into. */
    >  	if ((error = getsock(curproc, fd, &fp)) != 0)
    > -		return (error);
    > +		goto out;
    >  	sosp = fp->f_data;
    >  	if (sosp->so_proto->pr_usrreqs->pru_send !=
    >  	    so->so_proto->pr_usrreqs->pru_send) {
    >  		error = EPROTONOSUPPORT;
    > -		goto frele;
    > -	}
    > -	if (sosp->so_sp == NULL) {
    > -		sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
    > -		if (sosp->so_sp == NULL)
    > -			sosp->so_sp = sp;
    > -		else
    > -			pool_put(&sosplice_pool, sp);
    > +		goto out;
    >  	}
    > +	if (sosp->so_sp == NULL)
    > +		sosp->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
    >  
    > -	/* Lock both receive and send buffer. */
    > -	if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0) {
    > -		goto frele;
    > -	}
    >  	if ((error = sblock(so, &sosp->so_snd, SBL_WAIT)) != 0) {
    > -		sbunlock(so, &so->so_rcv);
    > -		goto frele;
    > +		goto out;
    >  	}
    >  
    >  	if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) {
    > @@ -1423,8 +1419,10 @@ sosplice(struct socket *so, int fd, off_
    >  	}
    >  
    >  	/* Splice so and sosp together. */
    > +	mtx_enter(&so->so_rcv.sb_mtx);
    >  	so->so_sp->ssp_socket = sosp;
    >  	sosp->so_sp->ssp_soback = so;
    > +	mtx_leave(&so->so_rcv.sb_mtx);
    >  	so->so_splicelen = 0;
    >  	so->so_splicemax = max;
    >  	if (tv)
    > @@ -1447,17 +1445,18 @@ sosplice(struct socket *so, int fd, off_
    >  
    >   release:
    >  	sbunlock(sosp, &sosp->so_snd);
    > -	sbunlock(so, &so->so_rcv);
    > - frele:
    > -	/*
    > -	 * FRELE() must not be called with the socket lock held. It is safe to
    > -	 * release the lock here as long as no other operation happen on the
    > -	 * socket when sosplice() returns. The dance could be avoided by
    > -	 * grabbing the socket lock inside this function.
    > -	 */
    > -	sounlock(so);
    > -	FRELE(fp, curproc);
    > -	solock(so);
    > + out:
    > +	if (so->so_rcv.sb_flags & SB_OWNLOCK) {
    > +		sounlock(so);
    > +		sbunlock(so, &so->so_rcv);
    > +	} else {
    > +		sbunlock(so, &so->so_rcv);
    > +		sounlock(so);
    > +	}
    > +
    > +	if (fp)
    > +		FRELE(fp, curproc);
    > +
    >  	return (error);
    >  }
    >  
    > @@ -1469,10 +1468,12 @@ sounsplice(struct socket *so, struct soc
    >  	task_del(sosplice_taskq, &so->so_splicetask);
    >  	timeout_del(&so->so_idleto);
    >  	sosp->so_snd.sb_flags &= ~SB_SPLICE;
    > +
    >  	mtx_enter(&so->so_rcv.sb_mtx);
    >  	so->so_rcv.sb_flags &= ~SB_SPLICE;
    > -	mtx_leave(&so->so_rcv.sb_mtx);
    >  	so->so_sp->ssp_socket = sosp->so_sp->ssp_soback = NULL;
    > +	mtx_leave(&so->so_rcv.sb_mtx);
    > +
    >  	/* Do not wakeup a socket that is about to be freed. */
    >  	if ((freeing & SOSP_FREEING_READ) == 0 && soreadable(so))
    >  		sorwakeup(so);
    > @@ -2025,7 +2026,6 @@ sosetopt(struct socket *so, int level, i
    >  			break;
    >  #ifdef SOCKET_SPLICE
    >  		case SO_SPLICE:
    > -			solock(so);
    >  			if (m == NULL) {
    >  				error = sosplice(so, -1, 0, NULL);
    >  			} else if (m->m_len < sizeof(int)) {
    > @@ -2038,7 +2038,6 @@ sosetopt(struct socket *so, int level, i
    >  				    mtod(m, struct splice *)->sp_max,
    >  				   &mtod(m, struct splice *)->sp_idle);
    >  			}
    > -			sounlock(so);
    >  			break;
    >  #endif /* SOCKET_SPLICE */
    >  
    
    
    