Index | Thread | Search

From:
Alexander Bluhm <bluhm@openbsd.org>
Subject:
Re: Don't take solock() in soreceive() for SOCK_RAW inet sockets
To:
Vitaliy Makkoveev <mvs@openbsd.org>
Cc:
tech@openbsd.org
Date:
Wed, 3 Apr 2024 21:57:09 +0200

Download raw body.

Thread
On Sun, Mar 31, 2024 at 11:53:02PM +0300, Vitaliy Makkoveev wrote:
> Reduce netlock pressure for inet sockets.
> 
> These sockets are not connection oriented, they don't call pru_rcvd(),
> they can't be spliced, they don't set `so_error'. Nothing to protect
> with solock() in soreceive(), filt_soread() and filt_soexcept() paths.
> 
> `so_rcv' buffer protected by `sb_mtx' mutex(9), but since it released in
> soreceive() path, sblock() required to serialize concurrent soreceive()
> and sorflush() threads. Current sblock() is some kind of rwlock(9)
> implementation, so introduce `sb_lock' and it directly for that purpose.
> 
> The sorflush() and callers were refactored to avoid solock() for raw
> inet sockets. This done to avoid packet processing stop.
> 
> The new SB_STANDALONE flag is partially redundant WITH SB_OWNLOCK, but
> they will be merged in one flag later.

Passed regress with witness.  But it found splassert:

splassert: soassertlocked: want 1 have 0
Starting stack trace...
soassertlocked(1) at soassertlocked+0xc7
splassert: soassertlocked: want 1 have 0
Parallel traceback, suppressed...
soassertlocked(d782f188) at soassertlocked+0xc7
sbreserve(d782f188,d782f270,10000) at sbreserve+0x19
sosetopt(d782f188,ffff,1001,dabe6400) at sosetopt+0x295
sys_setsockopt(d7709cb8,f5fbf850,f5fbf848) at sys_setsockopt+0x126
syscall(f5fbf890) at syscall+0x41d
Xsyscall_untramp() at Xsyscall_untramp+0xa9

I am not a big fan of another SB_ flag.  It is hard enough to
understand the others.

I have not done a propper review yet.

bluhm

> Index: sys/kern/uipc_socket.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/uipc_socket.c,v
> retrieving revision 1.325
> diff -u -p -r1.325 uipc_socket.c
> --- sys/kern/uipc_socket.c	31 Mar 2024 14:01:28 -0000	1.325
> +++ sys/kern/uipc_socket.c	31 Mar 2024 20:30:10 -0000
> @@ -142,6 +142,8 @@ soalloc(const struct protosw *prp, int w
>  		return (NULL);
>  	rw_init_flags(&so->so_lock, dp->dom_name, RWL_DUPOK);
>  	refcnt_init(&so->so_refcnt);
> +	rw_init(&so->so_rcv.sb_lock, "sbufrcv");
> +	rw_init(&so->so_snd.sb_lock, "sbufsnd");
>  	mtx_init(&so->so_rcv.sb_mtx, IPL_MPFLOOR);
>  	mtx_init(&so->so_snd.sb_mtx, IPL_MPFLOOR);
>  	klist_init_mutex(&so->so_rcv.sb_klist, &so->so_rcv.sb_mtx);
> @@ -155,10 +157,10 @@ soalloc(const struct protosw *prp, int w
>  	case AF_INET6:
>  		switch (prp->pr_type) {
>  		case SOCK_DGRAM:
> -			so->so_rcv.sb_flags |= SB_OWNLOCK;
> -			/* FALLTHROUGH */
> +			so->so_rcv.sb_flags |= SB_MTXLOCK | SB_OWNLOCK;
> +			break;
>  		case SOCK_RAW:
> -			so->so_rcv.sb_flags |= SB_MTXLOCK;
> +			so->so_rcv.sb_flags |= SB_MTXLOCK | SB_STANDALONE;
>  			break;
>  		}
>  		break;
> @@ -345,9 +347,22 @@ sofree(struct socket *so, int keep_lock)
>  	}
>  #endif /* SOCKET_SPLICE */
>  	sbrelease(so, &so->so_snd);
> -	sorflush(so);
> -	if (!keep_lock)
> +
> +	/*
> +	 * Regardless on '_locked' postfix, must release solock() before
> +	 * call sorflush_locked() for SB_STANDALONE marked socket. Can't
> +	 * release solock() and call sorflush() because solock() release
> +	 * is unwanted for tcp(4) socket. 
> +	 */
> +
> +	if (so->so_rcv.sb_flags & SB_STANDALONE)
>  		sounlock(so);
> +
> +	sorflush_locked(so);
> +
> +	if (!((so->so_rcv.sb_flags & SB_STANDALONE) || keep_lock))
> +		sounlock(so);
> +
>  #ifdef SOCKET_SPLICE
>  	if (so->so_sp) {
>  		/* Reuse splice idle, sounsplice() has been called before. */
> @@ -815,6 +830,7 @@ soreceive(struct socket *so, struct mbuf
>  	const struct protosw *pr = so->so_proto;
>  	struct mbuf *nextrecord;
>  	size_t resid, orig_resid = uio->uio_resid;
> +	int dosolock = ((so->so_rcv.sb_flags & SB_STANDALONE) == 0);
>  
>  	mp = mp0;
>  	if (paddr)
> @@ -844,12 +860,11 @@ bad:
>  	if (mp)
>  		*mp = NULL;
>  
> -	solock_shared(so);
> +	if (dosolock)
> +		solock_shared(so);
>  restart:
> -	if ((error = sblock(so, &so->so_rcv, SBLOCKWAIT(flags))) != 0) {
> -		sounlock_shared(so);
> -		return (error);
> -	}
> +	if ((error = sblock(so, &so->so_rcv, SBLOCKWAIT(flags))) != 0)
> +		goto out;
>  	sb_mtx_lock(&so->so_rcv);
>  
>  	m = so->so_rcv.sb_mb;
> @@ -914,14 +929,16 @@ restart:
>  		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
>  		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
>  
> -		if (so->so_rcv.sb_flags & SB_OWNLOCK) {
> +		if (so->so_rcv.sb_flags & (SB_OWNLOCK | SB_STANDALONE)) {
>  			sbunlock_locked(so, &so->so_rcv);
> -			sounlock_shared(so);
> +			if (dosolock)
> +				sounlock_shared(so);
>  			error = sbwait_locked(so, &so->so_rcv);
>  			sb_mtx_unlock(&so->so_rcv);
>  			if (error)
>  				return (error);
> -			solock_shared(so);
> +			if (dosolock)
> +				solock_shared(so);
>  		} else {
>  			sb_mtx_unlock(&so->so_rcv);
>  			sbunlock(so, &so->so_rcv);
> @@ -998,11 +1015,13 @@ dontblock:
>  			if (controlp) {
>  				if (pr->pr_domain->dom_externalize) {
>  					sb_mtx_unlock(&so->so_rcv);
> -					sounlock_shared(so);
> +					if (dosolock)
> +						sounlock_shared(so);
>  					error =
>  					    (*pr->pr_domain->dom_externalize)
>  					    (cm, controllen, flags);
> -					solock_shared(so);
> +					if (dosolock)
> +						solock_shared(so);
>  					sb_mtx_lock(&so->so_rcv);
>  				}
>  				*controlp = cm;
> @@ -1081,9 +1100,11 @@ dontblock:
>  			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
>  			resid = uio->uio_resid;
>  			sb_mtx_unlock(&so->so_rcv);
> -			sounlock_shared(so);
> +			if (dosolock)
> +				sounlock_shared(so);
>  			uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio);
> -			solock_shared(so);
> +			if (dosolock)
> +				solock_shared(so);
>  			sb_mtx_lock(&so->so_rcv);
>  			if (uio_error)
>  				uio->uio_resid = resid - len;
> @@ -1166,14 +1187,21 @@ dontblock:
>  				break;
>  			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
>  			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
> -			sb_mtx_unlock(&so->so_rcv);
> -			error = sbwait(so, &so->so_rcv);
> -			if (error) {
> -				sbunlock(so, &so->so_rcv);
> -				sounlock_shared(so);
> -				return (0);
> +			if (dosolock) {
> +				sb_mtx_unlock(&so->so_rcv);
> +				error = sbwait(so, &so->so_rcv);
> +				if (error) {
> +					sbunlock(so, &so->so_rcv);
> +					sounlock_shared(so);
> +					return (0);
> +				}
> +				sb_mtx_lock(&so->so_rcv);
> +			} else {
> +				if (sbwait_locked(so, &so->so_rcv)) {
> +					error = 0;
> +					goto release;
> +				}
>  			}
> -			sb_mtx_lock(&so->so_rcv);
>  			if ((m = so->so_rcv.sb_mb) != NULL)
>  				nextrecord = m->m_nextpkt;
>  		}
> @@ -1222,7 +1250,9 @@ dontblock:
>  release:
>  	sb_mtx_unlock(&so->so_rcv);
>  	sbunlock(so, &so->so_rcv);
> -	sounlock_shared(so);
> +out:
> +	if (dosolock)
> +		sounlock_shared(so);
>  	return (error);
>  }
>  
> @@ -1231,7 +1261,6 @@ soshutdown(struct socket *so, int how)
>  {
>  	int error = 0;
>  
> -	solock(so);
>  	switch (how) {
>  	case SHUT_RD:
>  		sorflush(so);
> @@ -1240,25 +1269,29 @@ soshutdown(struct socket *so, int how)
>  		sorflush(so);
>  		/* FALLTHROUGH */
>  	case SHUT_WR:
> +		solock(so);
>  		error = pru_shutdown(so);
> +		sounlock(so);
>  		break;
>  	default:
>  		error = EINVAL;
>  		break;
>  	}
> -	sounlock(so);
>  
>  	return (error);
>  }
>  
>  void
> -sorflush(struct socket *so)
> +sorflush_locked(struct socket *so)
>  {
>  	struct sockbuf *sb = &so->so_rcv;
>  	struct mbuf *m;
>  	const struct protosw *pr = so->so_proto;
>  	int error;
>  
> +	if ((sb->sb_flags & SB_STANDALONE) == 0)
> +		soassertlocked(so);
> +
>  	error = sblock(so, sb, SBL_WAIT | SBL_NOINTR);
>  	/* with SBL_WAIT and SLB_NOINTR sblock() must not fail */
>  	KASSERT(error == 0);
> @@ -1275,6 +1308,16 @@ sorflush(struct socket *so)
>  	m_purge(m);
>  }
>  
> +void
> +sorflush(struct socket *so)
> +{
> +	if ((so->so_rcv.sb_flags & SB_STANDALONE) == 0)
> +		solock(so);
> +	sorflush_locked(so);
> +	if ((so->so_rcv.sb_flags & SB_STANDALONE) == 0)
> +		sounlock(so);
> +}
> +
>  #ifdef SOCKET_SPLICE
>  
>  #define so_splicelen	so_sp->ssp_len
> @@ -1913,7 +1956,8 @@ sosetopt(struct socket *so, int level, i
>  			if ((long)cnt <= 0)
>  				cnt = 1;
>  
> -			solock(so);
> +			if (((so->so_rcv.sb_flags & SB_STANDALONE) == 0))
> +				solock(so);
>  			mtx_enter(&sb->sb_mtx);
>  
>  			switch (optname) {
> @@ -1939,7 +1983,8 @@ sosetopt(struct socket *so, int level, i
>  			}
>  
>  			mtx_leave(&sb->sb_mtx);
> -			sounlock(so);
> +			if (((so->so_rcv.sb_flags & SB_STANDALONE) == 0))
> +				sounlock(so);
>  
>  			break;
>  		    }
> Index: sys/kern/uipc_socket2.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/uipc_socket2.c,v
> retrieving revision 1.147
> diff -u -p -r1.147 uipc_socket2.c
> --- sys/kern/uipc_socket2.c	31 Mar 2024 13:50:00 -0000	1.147
> +++ sys/kern/uipc_socket2.c	31 Mar 2024 20:30:10 -0000
> @@ -350,7 +350,9 @@ socantsendmore(struct socket *so)
>  void
>  socantrcvmore(struct socket *so)
>  {
> -	soassertlocked(so);
> +	if ((so->so_rcv.sb_flags & SB_STANDALONE) == 0)
> +		soassertlocked(so);
> +
>  	mtx_enter(&so->so_rcv.sb_mtx);
>  	so->so_rcv.sb_state |= SS_CANTRCVMORE;
>  	mtx_leave(&so->so_rcv.sb_mtx);
> @@ -557,6 +559,17 @@ sblock(struct socket *so, struct sockbuf
>  {
>  	int error = 0, prio = PSOCK;
>  
> +	if (sb->sb_flags & SB_STANDALONE) {
> +		int rwflags = RW_WRITE;
> +
> +		if (!(flags & SBL_NOINTR || sb->sb_flags & SB_NOINTR))
> +			rwflags |= RW_INTR;
> +		if (!(flags & SBL_WAIT))
> +			rwflags |= RW_NOSLEEP;
> +
> +		return rw_enter(&sb->sb_lock, rwflags);
> +	}
> +
>  	soassertlocked(so);
>  
>  	mtx_enter(&sb->sb_mtx);
> @@ -589,6 +602,11 @@ out:
>  void
>  sbunlock_locked(struct socket *so, struct sockbuf *sb)
>  {
> +	if (sb->sb_flags & SB_STANDALONE) {
> +		rw_exit(&sb->sb_lock);
> +		return;
> +	}
> +
>  	MUTEX_ASSERT_LOCKED(&sb->sb_mtx);
>  
>  	sb->sb_flags &= ~SB_LOCK;
> @@ -601,6 +619,11 @@ sbunlock_locked(struct socket *so, struc
>  void
>  sbunlock(struct socket *so, struct sockbuf *sb)
>  {
> +	if (sb->sb_flags & SB_STANDALONE) {
> +		rw_exit(&sb->sb_lock);
> +		return;
> +	}
> +
>  	mtx_enter(&sb->sb_mtx);
>  	sbunlock_locked(so, sb);
>  	mtx_leave(&sb->sb_mtx);
> Index: sys/kern/uipc_usrreq.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v
> retrieving revision 1.203
> diff -u -p -r1.203 uipc_usrreq.c
> --- sys/kern/uipc_usrreq.c	26 Mar 2024 09:46:47 -0000	1.203
> +++ sys/kern/uipc_usrreq.c	31 Mar 2024 20:30:10 -0000
> @@ -1450,9 +1450,7 @@ unp_gc(void *arg __unused)
>  				 * thread.
>  				 */
>  				so = unp->unp_socket;
> -				solock(so);
>  				sorflush(so);
> -				sounlock(so);
>  			}
>  		}
>  	}
> Index: sys/sys/socketvar.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/socketvar.h,v
> retrieving revision 1.127
> diff -u -p -r1.127 socketvar.h
> --- sys/sys/socketvar.h	27 Mar 2024 22:47:53 -0000	1.127
> +++ sys/sys/socketvar.h	31 Mar 2024 20:30:10 -0000
> @@ -106,7 +106,8 @@ struct socket {
>   * Variables for socket buffering.
>   */
>  	struct	sockbuf {
> -		struct mutex sb_mtx;
> +		struct rwlock sb_lock; 
> +		struct mutex  sb_mtx;
>  /* The following fields are all zeroed on flush. */
>  #define	sb_startzero	sb_cc
>  		u_long	sb_cc;		/* actual chars in buffer */
> @@ -136,6 +137,7 @@ struct socket {
>  #define SB_NOINTR	0x0040		/* operations not interruptible */
>  #define SB_MTXLOCK	0x0080		/* use sb_mtx for sockbuf protection */
>  #define SB_OWNLOCK	0x0100		/* sb_mtx used standalone */
> +#define SB_STANDALONE	0x0200		/* standalone sblock() */
>  
>  	void	(*so_upcall)(struct socket *so, caddr_t arg, int waitf);
>  	caddr_t	so_upcallarg;		/* Arg for above */
> @@ -401,6 +403,7 @@ int	sosend(struct socket *, struct mbuf 
>  int	sosetopt(struct socket *, int, int, struct mbuf *);
>  int	soshutdown(struct socket *, int);
>  void	sorflush(struct socket *);
> +void	sorflush_locked(struct socket *);
>  void	sowakeup(struct socket *, struct sockbuf *);
>  void	sorwakeup(struct socket *);
>  void	sowwakeup(struct socket *);