Index | Thread | Search

From:
Mark Kettenis <mark.kettenis@xs4all.nl>
Subject:
Re: Use foffset() to push the KERNEL_LOCK()
To:
Martin Pieuchot <mpi@grenadille.net>
Cc:
mvs@openbsd.org, anton@openbsd.org, visa@openbsd.org, tech@openbsd.org
Date:
Sat, 01 Nov 2025 17:03:44 +0100

Download raw body.

Thread
> Date: Sat, 1 Nov 2025 13:45:41 +0100
> From: Martin Pieuchot <mpi@grenadille.net>
> 
> On 31/10/25(Fri) 16:08, Vitaliy Makkoveev wrote:
> > On Mon, Oct 27, 2025 at 10:08:08AM +0100, Martin Pieuchot wrote:
> > > Diff below uses foffset() in vn_read() and getdents(2) to push the
> > > KERNEL_LOCK() a bit further.  This is not much in itself but is a
> > > requirement for upcoming changes.
> > > 
> > > By reading the current offset before grabbing the vnode lock we
> > > accept a race: another thread could change it.
> > > 
> > > While here, grab the corresponding mutex before writing `f_offset' in
> > > dt(4).
> > > 
> > > ok?
> > > 
> > 
> > As I understand the code, vn_lock() and VOP_LOCK() paths do not require
> > kernel lock to be held, so you can leave vn_lock() as is and avoid
> > race for `f_offset'.
> 
> You're right. That said I'm interested in doing exactly the opposite of
> what you suggest.  That's why I sent this diff in the first place.
> 
> The race you mentioned is not a problem for the kernel to solve.  Is
> that clear or does that need explanations?

I think an explanation is needed.  POSIX does make certain guarantees
about atomicity of file operations (See 2.9.7 Thread Interactions with
File Operations in the System Interfaces part).  And while
posix_getdents() isn't listed there, read() is.

> > > Index: kern/syscalls.master
> > > ===================================================================
> > > RCS file: /cvs/src/sys/kern/syscalls.master,v
> > > diff -u -p -r1.270 syscalls.master
> > > --- kern/syscalls.master	24 May 2025 06:49:16 -0000	1.270
> > > +++ kern/syscalls.master	27 Oct 2025 08:49:58 -0000
> > > @@ -213,7 +213,7 @@
> > >  97	STD NOLOCK	{ int sys_socket(int domain, int type, int protocol); }
> > >  98	STD NOLOCK	{ int sys_connect(int s, const struct sockaddr *name, \
> > >  			    socklen_t namelen); }
> > > -99	STD		{ int sys_getdents(int fd, void *buf, size_t buflen); }
> > > +99	STD NOLOCK	{ int sys_getdents(int fd, void *buf, size_t buflen); }
> > >  100	STD		{ int sys_getpriority(int which, id_t who); }
> > >  101	STD NOLOCK	{ int sys_pipe2(int *fdp, int flags); }
> > >  102	STD NOLOCK	{ int sys_dup3(int from, int to, int flags); }
> > > Index: kern/vfs_syscalls.c
> > > ===================================================================
> > > RCS file: /cvs/src/sys/kern/vfs_syscalls.c,v
> > > diff -u -p -r1.378 vfs_syscalls.c
> > > --- kern/vfs_syscalls.c	20 Sep 2025 13:53:36 -0000	1.378
> > > +++ kern/vfs_syscalls.c	27 Oct 2025 08:51:30 -0000
> > > @@ -3146,6 +3146,7 @@ sys_getdents(struct proc *p, void *v, re
> > >  	struct iovec aiov;
> > >  	size_t buflen;
> > >  	int error, eofflag;
> > > +	off_t offset;
> > >  
> > >  	buflen = SCARG(uap, buflen);
> > >  
> > > @@ -3163,10 +3164,8 @@ sys_getdents(struct proc *p, void *v, re
> > >  		goto bad;
> > >  	}
> > >  
> > > -	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
> > > -
> > > -	if (fp->f_offset < 0) {
> > > -		VOP_UNLOCK(vp);
> > > +	offset = foffset(fp);
> > > +	if (offset < 0) {
> > >  		error = EINVAL;
> > >  		goto bad;
> > >  	}
> > > @@ -3179,12 +3178,15 @@ sys_getdents(struct proc *p, void *v, re
> > >  	auio.uio_segflg = UIO_USERSPACE;
> > >  	auio.uio_procp = p;
> > >  	auio.uio_resid = buflen;
> > > -	auio.uio_offset = fp->f_offset;
> > > +	auio.uio_offset = offset;
> > > +	KERNEL_LOCK();
> > > +	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
> > >  	error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag);
> > >  	mtx_enter(&fp->f_mtx);
> > >  	fp->f_offset = auio.uio_offset;
> > >  	mtx_leave(&fp->f_mtx);
> > >  	VOP_UNLOCK(vp);
> > > +	KERNEL_UNLOCK();
> > >  	if (error)
> > >  		goto bad;
> > >  	*retval = buflen - auio.uio_resid;
> > > Index: kern/vfs_vnops.c
> > > ===================================================================
> > > RCS file: /cvs/src/sys/kern/vfs_vnops.c,v
> > > diff -u -p -r1.125 vfs_vnops.c
> > > --- kern/vfs_vnops.c	6 Jan 2025 08:57:23 -0000	1.125
> > > +++ kern/vfs_vnops.c	27 Oct 2025 08:47:55 -0000
> > > @@ -346,26 +346,20 @@ vn_read(struct file *fp, struct uio *uio
> > >  	off_t offset;
> > >  	int error;
> > >  
> > > -	KERNEL_LOCK();
> > > -
> > > -	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
> > > -
> > >  	if ((fflags & FO_POSITION) == 0)
> > > -		offset = uio->uio_offset = fp->f_offset;
> > > +		offset = uio->uio_offset = foffset(fp);
> > >  	else
> > >  		offset = uio->uio_offset;
> > >  
> > >  	/* no wrap around of offsets except on character devices */
> > > -	if (vp->v_type != VCHR && count > LLONG_MAX - offset) {
> > > -		error = EINVAL;
> > > -		goto done;
> > > -	}
> > > +	if (vp->v_type != VCHR && count > LLONG_MAX - offset)
> > > +		return (EINVAL);
> > >  
> > > -	if (vp->v_type == VDIR) {
> > > -		error = EISDIR;
> > > -		goto done;
> > > -	}
> > > +	if (vp->v_type == VDIR)
> > > +		return (EISDIR);
> > >  
> > > +	KERNEL_LOCK();
> > > +	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
> > >  	error = VOP_READ(vp, uio, (fp->f_flag & FNONBLOCK) ? IO_NDELAY : 0,
> > >  	    cred);
> > >  	if ((fflags & FO_POSITION) == 0) {
> > > @@ -373,7 +367,6 @@ vn_read(struct file *fp, struct uio *uio
> > >  		fp->f_offset += count - uio->uio_resid;
> > >  		mtx_leave(&fp->f_mtx);
> > >  	}
> > > -done:
> > >  	VOP_UNLOCK(vp);
> > >  	KERNEL_UNLOCK();
> > >  	return (error);
> > > Index: dev/dt/dt_dev.c
> > > ===================================================================
> > > RCS file: /cvs/src/sys/dev/dt/dt_dev.c,v
> > > diff -u -p -r1.46 dt_dev.c
> > > --- dev/dt/dt_dev.c	22 Sep 2025 07:49:43 -0000	1.46
> > > +++ dev/dt/dt_dev.c	27 Oct 2025 08:43:27 -0000
> > > @@ -711,7 +711,9 @@ dt_ioctl_rd_vnode(struct dt_softc *sc, s
> > >  			fp->f_type = DTYPE_VNODE;
> > >  			fp->f_ops = &vnops;
> > >  			fp->f_data = vn;
> > > +			mtx_enter(&fp->f_mtx);
> > >  			fp->f_offset = 0;
> > > +			mtx_leave(&fp->f_mtx);
> > >  			dtrv->dtrv_fd = fd;
> > >  			fdplock(p->p_fd);
> > >  			fdinsert(p->p_fd, fd, UF_EXCLOSE, fp);
> > > 
> > > 
> > 
> 
> 
>