Download raw body.
Use foffset() to push the KERNEL_LOCK()
On Thu, Nov 13, 2025 at 09:38:53AM +0000, Martin Pieuchot wrote:
> On 12/11/25(Wed) 22:05, Vitaliy Makkoveev wrote:
> > [...]
> > I like the idea to push kernel lock deeper and keep it around
> > VOP_READ()/VOP_WRITE(). We don't need to move vn_lock()/VOP_LOCK(), so
> > the race for `f_offset' will be excluded.
> >
> > I can make this diff by myself if mpi@ has no interest in this
> > direction.
>
> Sure, I don't mind.
>
> Note that there are currently two independent bottlenecks when it comes
> to FS syscalls. One is related to VOP_READ() and VOP_WRITE() and I have
> a plan to address it.
>
> The other is related to namei(9) (doopenat(), dofstatat(), and
> sys___realpath()) and will certainly reduce the KERNREL_LOCK()
> contention with fewer efforts.
>
> So if you're interested in reducing the KERNEL_LOCK() contention in this
> area, may I suggest you look at vn_open() and all others parts of the
> kernel namei(9) is not yet surrounded by a KERNEL_LOCK() dance? Then we
> can start pushing it further into namei(9).
>
> Thanks,
> Martin
>
>
This diff is based on mpi@'s, but lefts vn_lock() untouched. So we
have races `f_offset'.
Index: sys/dev/dt/dt_dev.c
===================================================================
RCS file: /cvs/src/sys/dev/dt/dt_dev.c,v
retrieving revision 1.46
diff -u -p -r1.46 dt_dev.c
--- sys/dev/dt/dt_dev.c 22 Sep 2025 07:49:43 -0000 1.46
+++ sys/dev/dt/dt_dev.c 14 Nov 2025 18:51:57 -0000
@@ -711,7 +711,9 @@ dt_ioctl_rd_vnode(struct dt_softc *sc, s
fp->f_type = DTYPE_VNODE;
fp->f_ops = &vnops;
fp->f_data = vn;
+ mtx_enter(&fp->f_mtx);
fp->f_offset = 0;
+ mtx_leave(&fp->f_mtx);
dtrv->dtrv_fd = fd;
fdplock(p->p_fd);
fdinsert(p->p_fd, fd, UF_EXCLOSE, fp);
Index: sys/kern/syscalls.master
===================================================================
RCS file: /cvs/src/sys/kern/syscalls.master,v
retrieving revision 1.270
diff -u -p -r1.270 syscalls.master
--- sys/kern/syscalls.master 24 May 2025 06:49:16 -0000 1.270
+++ sys/kern/syscalls.master 14 Nov 2025 18:52:00 -0000
@@ -213,7 +213,7 @@
97 STD NOLOCK { int sys_socket(int domain, int type, int protocol); }
98 STD NOLOCK { int sys_connect(int s, const struct sockaddr *name, \
socklen_t namelen); }
-99 STD { int sys_getdents(int fd, void *buf, size_t buflen); }
+99 STD NOLOCK { int sys_getdents(int fd, void *buf, size_t buflen); }
100 STD { int sys_getpriority(int which, id_t who); }
101 STD NOLOCK { int sys_pipe2(int *fdp, int flags); }
102 STD NOLOCK { int sys_dup3(int from, int to, int flags); }
Index: sys/kern/vfs_syscalls.c
===================================================================
RCS file: /cvs/src/sys/kern/vfs_syscalls.c,v
retrieving revision 1.378
diff -u -p -r1.378 vfs_syscalls.c
--- sys/kern/vfs_syscalls.c 20 Sep 2025 13:53:36 -0000 1.378
+++ sys/kern/vfs_syscalls.c 14 Nov 2025 18:52:02 -0000
@@ -3146,6 +3146,7 @@ sys_getdents(struct proc *p, void *v, re
struct iovec aiov;
size_t buflen;
int error, eofflag;
+ off_t offset;
buflen = SCARG(uap, buflen);
@@ -3165,7 +3166,8 @@ sys_getdents(struct proc *p, void *v, re
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
- if (fp->f_offset < 0) {
+ offset = foffset(fp);
+ if (offset < 0) {
VOP_UNLOCK(vp);
error = EINVAL;
goto bad;
@@ -3179,8 +3181,10 @@ sys_getdents(struct proc *p, void *v, re
auio.uio_segflg = UIO_USERSPACE;
auio.uio_procp = p;
auio.uio_resid = buflen;
- auio.uio_offset = fp->f_offset;
+ auio.uio_offset = offset;
+ KERNEL_LOCK();
error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag);
+ KERNEL_UNLOCK();
mtx_enter(&fp->f_mtx);
fp->f_offset = auio.uio_offset;
mtx_leave(&fp->f_mtx);
Index: sys/kern/vfs_vnops.c
===================================================================
RCS file: /cvs/src/sys/kern/vfs_vnops.c,v
retrieving revision 1.125
diff -u -p -r1.125 vfs_vnops.c
--- sys/kern/vfs_vnops.c 6 Jan 2025 08:57:23 -0000 1.125
+++ sys/kern/vfs_vnops.c 14 Nov 2025 18:52:02 -0000
@@ -346,12 +346,10 @@ vn_read(struct file *fp, struct uio *uio
off_t offset;
int error;
- KERNEL_LOCK();
-
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
if ((fflags & FO_POSITION) == 0)
- offset = uio->uio_offset = fp->f_offset;
+ offset = uio->uio_offset = foffset(fp);
else
offset = uio->uio_offset;
@@ -366,8 +364,10 @@ vn_read(struct file *fp, struct uio *uio
goto done;
}
+ KERNEL_LOCK();
error = VOP_READ(vp, uio, (fp->f_flag & FNONBLOCK) ? IO_NDELAY : 0,
cred);
+ KERNEL_UNLOCK();
if ((fflags & FO_POSITION) == 0) {
mtx_enter(&fp->f_mtx);
fp->f_offset += count - uio->uio_resid;
@@ -375,7 +375,6 @@ vn_read(struct file *fp, struct uio *uio
}
done:
VOP_UNLOCK(vp);
- KERNEL_UNLOCK();
return (error);
}
Use foffset() to push the KERNEL_LOCK()