Index | Thread | Search

From:
Mark Kettenis <mark.kettenis@xs4all.nl>
Subject:
Re: Use foffset() to push the KERNEL_LOCK()
To:
Vitaliy Makkoveev <mvs@openbsd.org>
Cc:
anton@openbsd.org, visa@openbsd.org, tech@openbsd.org
Date:
Sat, 15 Nov 2025 09:27:47 +0100

Download raw body.

Thread
> Date: Fri, 14 Nov 2025 21:58:05 +0300
> From: Vitaliy Makkoveev <mvs@openbsd.org>
> 
> On Thu, Nov 13, 2025 at 09:38:53AM +0000, Martin Pieuchot wrote:
> > On 12/11/25(Wed) 22:05, Vitaliy Makkoveev wrote:
> > > [...] 
> > > I like the idea to push kernel lock deeper and keep it around
> > > VOP_READ()/VOP_WRITE(). We don't need to move vn_lock()/VOP_LOCK(), so
> > > the race for `f_offset' will be excluded.
> > > 
> > > I can make this diff by myself if mpi@ has no interest in this
> > > direction.
> > 
> > Sure, I don't mind.
> > 
> > Note that there are currently two independent  bottlenecks when it comes
> > to FS syscalls.  One is related to VOP_READ() and VOP_WRITE() and I have
> > a plan to address it.
> > 
> > The other is related to namei(9) (doopenat(), dofstatat(), and
> > sys___realpath()) and will certainly reduce the KERNREL_LOCK()
> > contention with fewer efforts.
> > 
> > So if you're interested in reducing the KERNEL_LOCK() contention in this
> > area, may I suggest you look at vn_open() and all others parts of the
> > kernel namei(9) is not yet surrounded by a KERNEL_LOCK() dance?  Then we
> > can start pushing it further into namei(9).
> > 
> > Thanks,
> > Martin
> > 
> > 
> 
> This diff is based on mpi@'s, but lefts vn_lock() untouched. So we
> have races `f_offset'.

Sorry, but that sentence makes no sense.

> Index: sys/dev/dt/dt_dev.c
> ===================================================================
> RCS file: /cvs/src/sys/dev/dt/dt_dev.c,v
> retrieving revision 1.46
> diff -u -p -r1.46 dt_dev.c
> --- sys/dev/dt/dt_dev.c	22 Sep 2025 07:49:43 -0000	1.46
> +++ sys/dev/dt/dt_dev.c	14 Nov 2025 18:51:57 -0000
> @@ -711,7 +711,9 @@ dt_ioctl_rd_vnode(struct dt_softc *sc, s
>  			fp->f_type = DTYPE_VNODE;
>  			fp->f_ops = &vnops;
>  			fp->f_data = vn;
> +			mtx_enter(&fp->f_mtx);
>  			fp->f_offset = 0;
> +			mtx_leave(&fp->f_mtx);
>  			dtrv->dtrv_fd = fd;
>  			fdplock(p->p_fd);
>  			fdinsert(p->p_fd, fd, UF_EXCLOSE, fp);
> Index: sys/kern/syscalls.master
> ===================================================================
> RCS file: /cvs/src/sys/kern/syscalls.master,v
> retrieving revision 1.270
> diff -u -p -r1.270 syscalls.master
> --- sys/kern/syscalls.master	24 May 2025 06:49:16 -0000	1.270
> +++ sys/kern/syscalls.master	14 Nov 2025 18:52:00 -0000
> @@ -213,7 +213,7 @@
>  97	STD NOLOCK	{ int sys_socket(int domain, int type, int protocol); }
>  98	STD NOLOCK	{ int sys_connect(int s, const struct sockaddr *name, \
>  			    socklen_t namelen); }
> -99	STD		{ int sys_getdents(int fd, void *buf, size_t buflen); }
> +99	STD NOLOCK	{ int sys_getdents(int fd, void *buf, size_t buflen); }
>  100	STD		{ int sys_getpriority(int which, id_t who); }
>  101	STD NOLOCK	{ int sys_pipe2(int *fdp, int flags); }
>  102	STD NOLOCK	{ int sys_dup3(int from, int to, int flags); }
> Index: sys/kern/vfs_syscalls.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/vfs_syscalls.c,v
> retrieving revision 1.378
> diff -u -p -r1.378 vfs_syscalls.c
> --- sys/kern/vfs_syscalls.c	20 Sep 2025 13:53:36 -0000	1.378
> +++ sys/kern/vfs_syscalls.c	14 Nov 2025 18:52:02 -0000
> @@ -3146,6 +3146,7 @@ sys_getdents(struct proc *p, void *v, re
>  	struct iovec aiov;
>  	size_t buflen;
>  	int error, eofflag;
> +	off_t offset;
>  
>  	buflen = SCARG(uap, buflen);
>  
> @@ -3165,7 +3166,8 @@ sys_getdents(struct proc *p, void *v, re
>  
>  	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
>  
> -	if (fp->f_offset < 0) {
> +	offset = foffset(fp);
> +	if (offset < 0) {
>  		VOP_UNLOCK(vp);
>  		error = EINVAL;
>  		goto bad;
> @@ -3179,8 +3181,10 @@ sys_getdents(struct proc *p, void *v, re
>  	auio.uio_segflg = UIO_USERSPACE;
>  	auio.uio_procp = p;
>  	auio.uio_resid = buflen;
> -	auio.uio_offset = fp->f_offset;
> +	auio.uio_offset = offset;
> +	KERNEL_LOCK();
>  	error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag);
> +	KERNEL_UNLOCK();
>  	mtx_enter(&fp->f_mtx);
>  	fp->f_offset = auio.uio_offset;
>  	mtx_leave(&fp->f_mtx);
> Index: sys/kern/vfs_vnops.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/vfs_vnops.c,v
> retrieving revision 1.125
> diff -u -p -r1.125 vfs_vnops.c
> --- sys/kern/vfs_vnops.c	6 Jan 2025 08:57:23 -0000	1.125
> +++ sys/kern/vfs_vnops.c	14 Nov 2025 18:52:02 -0000
> @@ -346,12 +346,10 @@ vn_read(struct file *fp, struct uio *uio
>  	off_t offset;
>  	int error;
>  
> -	KERNEL_LOCK();
> -
>  	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
>  
>  	if ((fflags & FO_POSITION) == 0)
> -		offset = uio->uio_offset = fp->f_offset;
> +		offset = uio->uio_offset = foffset(fp);
>  	else
>  		offset = uio->uio_offset;
>  
> @@ -366,8 +364,10 @@ vn_read(struct file *fp, struct uio *uio
>  		goto done;
>  	}
>  
> +	KERNEL_LOCK();
>  	error = VOP_READ(vp, uio, (fp->f_flag & FNONBLOCK) ? IO_NDELAY : 0,
>  	    cred);
> +	KERNEL_UNLOCK();
>  	if ((fflags & FO_POSITION) == 0) {
>  		mtx_enter(&fp->f_mtx);
>  		fp->f_offset += count - uio->uio_resid;
> @@ -375,7 +375,6 @@ vn_read(struct file *fp, struct uio *uio
>  	}
>  done:
>  	VOP_UNLOCK(vp);
> -	KERNEL_UNLOCK();
>  	return (error);
>  }
>  
>