Download raw body.
btrace, identifying syscall+0x5c4
On 02/12/24(Mon) 14:49, Stuart Henderson wrote:
> On 2024/12/02 14:35, Stuart Henderson wrote:
> > On 2024/12/02 15:12, Claudio Jeker wrote:
> > > On Mon, Dec 02, 2024 at 01:53:02PM +0000, Stuart Henderson wrote:
> > > > Thought I'd take a look at a kernel profile from a machine which is
> > > > fairly "interesting" in terms of kernel spin. (Running -current).
> > > >
> > > > In the attached output from running "btrace kprofile.bt" I have a
> > > > lot of time accounted for under syscall+0x5c4, which has _kernel_lock
> > > > directly next to it in the traces, and I was wondering what that is.
> > > > Bit confused by disasm of trap.o but my guess is that it might be
> > > > a lock relating to dt(4) itself, is that right?
> > >
> > > I think this is the bit of syscall that grabs the kernel lock for any
> > > syscall not marked NOLOCK. In your case this is mostly fstat*, open*
> > > or the bits under syscall+0x5e7.
> >
> > Thank you.
> >
> > > Your system does a lot of parallel disk IO and that is all still using the
> > > KERNEL_LOCK.
> >
> > oh, I should try "VFS syscalls & KERNEL_LOCK()" on this machine!
>
> ...and fwiw that looks like this:
Interesting. What kind of workload is that? You have the same amount
of contention spread across open(2), fstat(2), read(2) and close(2).
If fstat(2) is done on a pipe, socket or kqueue the updated diff below
might help. It pushes the KERNEL_LOCK() down to vn_statfile() in this
syscall.
Index: kern/syscalls.master
===================================================================
RCS file: /cvs/src/sys/kern/syscalls.master,v
diff -u -p -r1.265 syscalls.master
--- kern/syscalls.master 2 Aug 2024 14:34:45 -0000 1.265
+++ kern/syscalls.master 28 Nov 2024 18:55:36 -0000
@@ -52,7 +52,7 @@
3 STD NOLOCK { ssize_t sys_read(int fd, void *buf, size_t nbyte); }
4 STD NOLOCK { ssize_t sys_write(int fd, const void *buf, \
size_t nbyte); }
-5 STD { int sys_open(const char *path, \
+5 STD NOLOCK { int sys_open(const char *path, \
int flags, ... mode_t mode); }
6 STD NOLOCK { int sys_close(int fd); }
7 STD NOLOCK { int sys_getentropy(void *buf, size_t nbyte); }
@@ -104,11 +104,11 @@
35 STD { int sys_fchflags(int fd, u_int flags); }
36 STD { void sys_sync(void); }
37 OBSOL msyscall
-38 STD { int sys_stat(const char *path, struct stat *ub); }
+38 STD NOLOCK { int sys_stat(const char *path, struct stat *ub); }
39 STD NOLOCK { pid_t sys_getppid(void); }
-40 STD { int sys_lstat(const char *path, struct stat *ub); }
+40 STD NOLOCK { int sys_lstat(const char *path, struct stat *ub); }
41 STD NOLOCK { int sys_dup(int fd); }
-42 STD { int sys_fstatat(int fd, const char *path, \
+42 STD NOLOCK { int sys_fstatat(int fd, const char *path, \
struct stat *buf, int flag); }
43 STD NOLOCK { gid_t sys_getegid(void); }
44 STD { int sys_profil(caddr_t samples, size_t size, \
@@ -133,7 +133,7 @@
51 UNIMPL acct
#endif
52 STD { int sys_sigpending(void); }
-53 STD { int sys_fstat(int fd, struct stat *sb); }
+53 STD NOLOCK { int sys_fstat(int fd, struct stat *sb); }
54 STD NOLOCK { int sys_ioctl(int fd, \
u_long com, ... void *data); }
55 STD { int sys_reboot(int opt); }
@@ -240,7 +240,7 @@
113 UNIMPL fktrace
114 STD { int sys_unveil(const char *path, \
const char *permissions); }
-115 STD { int sys___realpath(const char *pathname, \
+115 STD NOLOCK { int sys___realpath(const char *pathname, \
char *resolved); }
116 STD NOLOCK { int sys_recvmmsg(int s, struct mmsghdr *mmsg, \
unsigned int vlen, int flags, \
@@ -553,7 +553,7 @@
mode_t mode); }
320 STD { int sys_mknodat(int fd, const char *path, \
mode_t mode, dev_t dev); }
-321 STD { int sys_openat(int fd, const char *path, int flags, \
+321 STD NOLOCK { int sys_openat(int fd, const char *path, int flags, \
... mode_t mode); }
322 STD { ssize_t sys_readlinkat(int fd, const char *path, \
char *buf, size_t count); }
Index: kern/vfs_syscalls.c
===================================================================
RCS file: /cvs/src/sys/kern/vfs_syscalls.c,v
diff -u -p -r1.370 vfs_syscalls.c
--- kern/vfs_syscalls.c 5 Nov 2024 06:03:19 -0000 1.370
+++ kern/vfs_syscalls.c 28 Nov 2024 18:19:24 -0000
@@ -894,8 +894,10 @@ sys___realpath(struct proc *p, void *v,
bp = &cwdbuf[cwdlen - 1];
*bp = '\0';
+ KERNEL_LOCK();
error = vfs_getcwd_common(p->p_fd->fd_cdir, NULL, &bp, cwdbuf,
cwdlen/2, GETCWD_CHECK_ACCESS, p);
+ KERNEL_UNLOCK();
if (error) {
free(cwdbuf, M_TEMP, cwdlen);
@@ -919,12 +921,16 @@ sys___realpath(struct proc *p, void *v,
nd.ni_pledge = PLEDGE_RPATH;
nd.ni_unveil = UNVEIL_READ;
- if ((error = namei(&nd)) != 0)
+ KERNEL_LOCK();
+ if ((error = namei(&nd)) != 0) {
+ KERNEL_UNLOCK();
goto end;
+ }
/* release reference from namei */
if (nd.ni_vp)
vrele(nd.ni_vp);
+ KERNEL_UNLOCK();
error = copyoutstr(nd.ni_cnd.cn_rpbuf, SCARG(uap, resolved),
MAXPATHLEN, NULL);
@@ -1120,23 +1126,20 @@ doopenat(struct proc *p, int fd, const c
localtrunc = 1;
flags &= ~O_TRUNC; /* Must do truncate ourselves */
}
+ KERNEL_LOCK();
if ((error = vn_open(&nd, flags, cmode)) != 0) {
fdplock(fdp);
if (error == ENODEV &&
p->p_dupfd >= 0 && /* XXX from fdopen */
(error =
dupfdopen(p, indx, flags)) == 0) {
- fdpunlock(fdp);
- closef(fp, p);
*retval = indx;
- return (error);
+ goto error;
}
if (error == ERESTART)
error = EINTR;
fdremove(fdp, indx);
- fdpunlock(fdp);
- closef(fp, p);
- return (error);
+ goto error;
}
p->p_dupfd = 0;
vp = nd.ni_vp;
@@ -1161,9 +1164,7 @@ doopenat(struct proc *p, int fd, const c
fdplock(fdp);
/* closef will vn_close the file for us. */
fdremove(fdp, indx);
- fdpunlock(fdp);
- closef(fp, p);
- return (error);
+ goto error;
}
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
atomic_setbits_int(&fp->f_iflags, FIF_HASLOCK);
@@ -1185,18 +1186,22 @@ doopenat(struct proc *p, int fd, const c
fdplock(fdp);
/* closef will close the file for us. */
fdremove(fdp, indx);
- fdpunlock(fdp);
- closef(fp, p);
- return (error);
+ goto error;
}
}
VOP_UNLOCK(vp);
+ KERNEL_UNLOCK();
*retval = indx;
fdplock(fdp);
fdinsert(fdp, indx, cloexec, fp);
fdpunlock(fdp);
FRELE(fp, p);
return (error);
+error:
+ KERNEL_UNLOCK();
+ fdpunlock(fdp);
+ closef(fp, p);
+ return (error);
}
/*
@@ -2066,10 +2071,14 @@ dofstatat(struct proc *p, int fd, const
NDINITAT(&nd, LOOKUP, follow | LOCKLEAF, UIO_USERSPACE, fd, path, p);
nd.ni_pledge = PLEDGE_RPATH;
nd.ni_unveil = UNVEIL_READ;
- if ((error = namei(&nd)) != 0)
+ KERNEL_LOCK();
+ if ((error = namei(&nd)) != 0) {
+ KERNEL_UNLOCK();
return (error);
+ }
error = vn_stat(nd.ni_vp, &sb, p);
vput(nd.ni_vp);
+ KERNEL_UNLOCK();
if (error)
return (error);
/* Don't let non-root see generation numbers (for NFS security) */
Index: kern/vfs_vnops.c
===================================================================
RCS file: /cvs/src/sys/kern/vfs_vnops.c,v
diff -u -p -r1.123 vfs_vnops.c
--- kern/vfs_vnops.c 5 Nov 2024 06:03:19 -0000 1.123
+++ kern/vfs_vnops.c 2 Dec 2024 16:42:23 -0000
@@ -427,7 +427,13 @@ int
vn_statfile(struct file *fp, struct stat *sb, struct proc *p)
{
struct vnode *vp = fp->f_data;
- return vn_stat(vp, sb, p);
+ int error;
+
+ KERNEL_LOCK();
+ error = vn_stat(vp, sb, p);
+ KERNEL_UNLOCK();
+
+ return (error);
}
/*
btrace, identifying syscall+0x5c4